diff --git a/.cmake-format.yaml b/.cmake-format.yaml index ed1370b64..ef47853ee 100644 --- a/.cmake-format.yaml +++ b/.cmake-format.yaml @@ -21,6 +21,7 @@ parse: omnitrace_add_test: flags: - SKIP_BASELINE + - SKIP_PRELOAD - SKIP_REWRITE - SKIP_RUNTIME - SKIP_SAMPLING @@ -32,16 +33,21 @@ parse: NUM_PROCS: '*' REWRITE_TIMEOUT: '*' RUNTIME_TIMEOUT: '*' + PRELOAD_TIMEOUT: '*' REWRITE_ARGS: '*' RUNTIME_ARGS: '*' RUN_ARGS: '*' ENVIRONMENT: '*' LABELS: '*' PROPERTIES: '*' + PRELOAD_PASS_REGEX: '*' + PRELOAD_FAIL_REGEX: '*' RUNTIME_PASS_REGEX: '*' RUNTIME_FAIL_REGEX: '*' REWRITE_PASS_REGEX: '*' REWRITE_FAIL_REGEX: '*' + BASELINE_PASS_REGEX: '*' + BASELINE_FAIL_REGEX: '*' REWRITE_RUN_PASS_REGEX: '*' REWRITE_RUN_FAIL_REGEX: '*' omnitrace_target_compile_definitions: diff --git a/.github/workflows/cpack.yml b/.github/workflows/cpack.yml index ed2dbbde8..99bcebeeb 100644 --- a/.github/workflows/cpack.yml +++ b/.github/workflows/cpack.yml @@ -78,7 +78,7 @@ jobs: timeout-minutes: 10 uses: actions/upload-artifact@v2 with: - name: ubuntu-stgz-installers + name: ubuntu-${{ matrix.os }}-rocm-${{ matrix.rocm-version }}-stgz-installers path: | build-release/stgz/*.sh @@ -86,7 +86,7 @@ jobs: timeout-minutes: 10 uses: actions/upload-artifact@v2 with: - name: ubuntu-deb-installers + name: ubuntu-${{ matrix.os }}-rocm-${{ matrix.rocm-version }}-deb-installers path: | build-release/deb/*.deb @@ -187,7 +187,7 @@ jobs: timeout-minutes: 10 uses: actions/upload-artifact@v2 with: - name: opensuse-stgz-installers + name: opensuse-${{ matrix.os }}-rocm-${{ matrix.rocm-version }}-stgz-installers path: | build-release/stgz/*.sh @@ -195,7 +195,7 @@ jobs: timeout-minutes: 10 uses: actions/upload-artifact@v2 with: - name: opensuse-rpm-installers + name: opensuse-${{ matrix.os }}-rocm-${{ matrix.rocm-version }}-rpm-installers path: | build-release/rpm/*.rpm diff --git a/CMakeLists.txt b/CMakeLists.txt index b8f9193cb..1a5d35ee6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,8 +90,46 @@ 
if(DEFINED CMAKE_INSTALL_LIBDIR AND NOT DEFINED CMAKE_DEFAULT_INSTALL_LIBDIR)
         CACHE STRING "Object code libraries" FORCE)
 endif()

+if(NOT "$ENV{OMNITRACE_CI}" STREQUAL "") # a non-empty OMNITRACE_CI env var decides CI_BUILD
+    set(CI_BUILD $ENV{OMNITRACE_CI})
+else()
+    set(CI_BUILD OFF)
+endif()
+
 include(GNUInstallDirs) # install directories
 include(MacroUtilities) # various functions and macros
+# option defaults: CI builds turn on CI asserts + testing and turn off hidden visibility/stripping
+if(CI_BUILD)
+    omnitrace_add_option(OMNITRACE_BUILD_CI "Enable internal asserts, etc." ON ADVANCED
+                         NO_FEATURE)
+    omnitrace_add_option(OMNITRACE_BUILD_TESTING "Enable building the testing suite" ON
+                         ADVANCED)
+    omnitrace_add_option(OMNITRACE_BUILD_DEBUG
+                         "Enable building with extensive debug symbols" OFF ADVANCED)
+    omnitrace_add_option(OMNITRACE_CUSTOM_DATA_SOURCE "Enable custom data source" OFF
+                         ADVANCED)
+    omnitrace_add_option(
+        OMNITRACE_BUILD_HIDDEN_VISIBILITY
+        "Build with hidden visibility (disable for Debug builds)" OFF ADVANCED)
+    omnitrace_add_option(OMNITRACE_STRIP_LIBRARIES "Strip the libraries" OFF ADVANCED)
+else() # NOTE(review): OMNITRACE_BUILD_EXAMPLES is declared only in this non-CI branch -- confirm CI builds do not need it
+    omnitrace_add_option(OMNITRACE_BUILD_CI "Enable internal asserts, etc." OFF ADVANCED
+                         NO_FEATURE)
+    omnitrace_add_option(OMNITRACE_BUILD_EXAMPLES "Enable building the examples" OFF
+                         ADVANCED)
+    omnitrace_add_option(OMNITRACE_BUILD_TESTING "Enable building the testing suite" OFF
+                         ADVANCED)
+    omnitrace_add_option(OMNITRACE_BUILD_DEBUG
+                         "Enable building with extensive debug symbols" OFF ADVANCED)
+    omnitrace_add_option(OMNITRACE_CUSTOM_DATA_SOURCE "Enable custom data source" OFF
+                         ADVANCED)
+    omnitrace_add_option(
+        OMNITRACE_BUILD_HIDDEN_VISIBILITY
+        "Build with hidden visibility (disable for Debug builds)" ON ADVANCED)
+    omnitrace_add_option(OMNITRACE_STRIP_LIBRARIES "Strip the libraries"
+                         ${_STRIP_LIBRARIES_DEFAULT} ADVANCED)
+endif()
+
 include(Compilers) # compiler identification
 include(BuildSettings) # compiler flags

@@ -135,22 +173,8 @@ omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" ON)
 omnitrace_add_option(OMNITRACE_USE_PYTHON "Enable Python support" OFF)
 omnitrace_add_option(OMNITRACE_BUILD_DYNINST "Build dyninst from submodule" OFF)
 omnitrace_add_option(OMNITRACE_BUILD_LIBUNWIND "Build libunwind from submodule" ON)
-omnitrace_add_option(OMNITRACE_BUILD_EXAMPLES "Enable building the examples" OFF ADVANCED)
-omnitrace_add_option(OMNITRACE_BUILD_TESTING "Enable building the testing suite" OFF
-                     ADVANCED)
-omnitrace_add_option(OMNITRACE_BUILD_DEBUG "Enable building with extensive debug symbols"
-                     OFF ADVANCED)
-omnitrace_add_option(OMNITRACE_CUSTOM_DATA_SOURCE "Enable custom data source" OFF
-                     ADVANCED)
-omnitrace_add_option(
-    OMNITRACE_BUILD_HIDDEN_VISIBILITY
-    "Build with hidden visibility (disable for Debug builds)" ON ADVANCED)
-omnitrace_add_option(OMNITRACE_BUILD_CI "Enable internal asserts, etc." OFF ADVANCED
-                     NO_FEATURE)
 omnitrace_add_option(OMNITRACE_INSTALL_PERFETTO_TOOLS
                      "Install perfetto tools (i.e. traced, perfetto, etc.)" OFF)
-omnitrace_add_option(OMNITRACE_STRIP_LIBRARIES "Strip the libraries"
-                     ${_STRIP_LIBRARIES_DEFAULT} ADVANCED)

 if(OMNITRACE_USE_PAPI)
     omnitrace_add_option(OMNITRACE_BUILD_PAPI "Build PAPI from submodule" ON)
@@ -161,16 +185,6 @@ if(OMNITRACE_USE_PYTHON)
                          "Build python bindings with internal pybind11" ON)
 endif()

-if(NOT "$ENV{OMNITRACE_CI}" STREQUAL "")
-    message(
-        AUTHOR_WARNING
-            "OMNITRACE_CI environment variable ($ENV{OMNITRACE_CI}) is overridding the OMNITRACE_BUILD_CI cache value"
-        )
-    set(OMNITRACE_BUILD_CI
-        "$ENV{OMNITRACE_CI}"
-        CACHE BOOL "Enable internal asserts, etc" FORCE)
-endif()
-
 if(NOT OMNITRACE_USE_HIP)
     set(OMNITRACE_USE_ROCTRACER
         OFF
diff --git a/cmake/BuildSettings.cmake b/cmake/BuildSettings.cmake
index d5dc9f016..286f50d2b 100644
--- a/cmake/BuildSettings.cmake
+++ b/cmake/BuildSettings.cmake
@@ -12,8 +12,9 @@ include(Compilers)
 include(FindPackageHandleStandardArgs)
 include(MacroUtilities)

-omnitrace_add_option(OMNITRACE_BUILD_DEVELOPER
-                     "Extra build flags for development like -Werror" OFF)
+omnitrace_add_option(
+    OMNITRACE_BUILD_DEVELOPER "Extra build flags for development like -Werror"
+    ${OMNITRACE_BUILD_CI}) # default follows the OMNITRACE_BUILD_CI value
 omnitrace_add_option(OMNITRACE_BUILD_EXTRA_OPTIMIZATIONS "Extra optimization flags" OFF)
 omnitrace_add_option(OMNITRACE_BUILD_LTO "Build with link-time optimization" OFF)
 omnitrace_add_option(OMNITRACE_USE_COMPILE_TIMING
diff --git a/external/timemory b/external/timemory
index a781a2169..98e2306ca 160000
--- a/external/timemory
+++ b/external/timemory
@@ -1 +1 @@
-Subproject commit a781a2169589e375db220f6e615d26714d8b5ba7
+Subproject commit 98e2306ca9226226013335637ce6c33f72bf1e3a
diff --git a/source/bin/CMakeLists.txt b/source/bin/CMakeLists.txt
index 0d64fcadb..c00f45b99 100644
--- a/source/bin/CMakeLists.txt
+++ b/source/bin/CMakeLists.txt
@@ -16,6 +16,7 @@ endif()
 # executables
 add_subdirectory(omnitrace-avail)
 add_subdirectory(omnitrace-critical-trace)
+add_subdirectory(omnitrace-sample)
 add_subdirectory(omnitrace)

 if(OMNITRACE_BUILD_TESTING OR "$ENV{OMNITRACE_CI}" MATCHES "[1-9]+|ON|on|y|yes")
diff --git a/source/bin/omnitrace-sample/CMakeLists.txt b/source/bin/omnitrace-sample/CMakeLists.txt
new file mode 100644
index 000000000..e502d4a26
--- /dev/null
+++ b/source/bin/omnitrace-sample/CMakeLists.txt
@@ -0,0 +1,22 @@
+# ------------------------------------------------------------------------------#
+#
+# omnitrace-sample target
+# executable built from omnitrace-sample.cpp and impl.cpp in this directory
+# ------------------------------------------------------------------------------#
+
+add_executable(omnitrace-sample ${CMAKE_CURRENT_LIST_DIR}/omnitrace-sample.cpp
+                                ${CMAKE_CURRENT_LIST_DIR}/impl.cpp)
+
+target_include_directories(omnitrace-sample PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_link_libraries(
+    omnitrace-sample
+    PRIVATE omnitrace::omnitrace-compile-definitions omnitrace::omnitrace-headers
+            omnitrace::omnitrace-common-library)
+set_target_properties(
+    omnitrace-sample PROPERTIES BUILD_RPATH "\$ORIGIN:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}"
+                                INSTALL_RPATH "${OMNITRACE_EXE_INSTALL_RPATH}") # \$ORIGIN is escaped so it reaches the RPATH literally
+
+install(
+    TARGETS omnitrace-sample
+    DESTINATION ${CMAKE_INSTALL_BINDIR}
+    OPTIONAL)
diff --git a/source/bin/omnitrace-sample/impl.cpp b/source/bin/omnitrace-sample/impl.cpp
new file mode 100644
index 000000000..4d235ac3d
--- /dev/null
+++ b/source/bin/omnitrace-sample/impl.cpp
@@ -0,0 +1,676 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+
+#include "omnitrace-sample.hpp"
+
+#include "common/delimit.hpp"
+#include "common/environment.hpp"
+#include "common/join.hpp"
+#include "common/setup.hpp"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace color = tim::log::color;
+using tim::log::stream;
+using namespace timemory::join;
+using tim::get_env;
+
+namespace
+{
+int verbose = 0;  // set by the -v/--verbose option in parse_args
+}
+// Look up the command via omnitrace::path::find_path, passing the PATH environment variable
+std::string
+get_command(const char* _argv0)
+{
+    return omnitrace::path::find_path(_argv0, 0, omnitrace::common::get_env("PATH", ""));
+}
+// realpath() of _v as a std::string; returns _v unchanged when realpath() fails
+std::string
+get_realpath(const std::string& _v)
+{
+    auto* _tmp = realpath(_v.c_str(), nullptr);
+    auto _ret = (_tmp != nullptr) ? std::string{ _tmp } : _v;  // realpath() yields nullptr on failure
+    free(_tmp);
+    return _ret;
+}
+// When verbose >= 1, print the space-joined command line to stdout
+void
+print_command(const std::vector& _argv)
+{
+    if(verbose >= 1)
+        stream(std::cout, color::info())
+            << "Executing '" << join(array_config{ " " }, _argv) << "'...\n";
+}
+// Copy environ (strdup'd entries) and add the omnitrace defaults: LD_PRELOAD, sampling, tool libraries
+std::vector
+get_initial_environment()
+{
+    std::vector _env;
+    if(environ != nullptr)
+    {
+        int idx = 0;
+        while(environ[idx] != nullptr)
+            _env.emplace_back(strdup(environ[idx++]));
+    }
+
+    update_env(_env, "LD_PRELOAD",
+               get_realpath(get_internal_libpath("libomnitrace-dl.so")), true);
+
+    auto _dl_libpath =
+        get_realpath(get_internal_libpath("libomnitrace-dl.so"));
+    auto _omni_libpath =
+        get_realpath(get_internal_libpath("libomnitrace.so"));
+
+    update_env(_env, "OMNITRACE_USE_SAMPLING", true);
+    update_env(_env, "OMNITRACE_CRITICAL_TRACE", false);
+    update_env(_env, "OMNITRACE_USE_PROCESS_SAMPLING", false);
+    // update_env(_env, "OMNITRACE_USE_PID", false);
+    // update_env(_env, "OMNITRACE_TIME_OUTPUT", false);
+    // update_env(_env, "OMNITRACE_OUTPUT_PATH", "omnitrace-output/%tag%/%launch_time%");
+
+#if defined(OMNITRACE_USE_ROCTRACER) || defined(OMNITRACE_USE_ROCPROFILER)
+    update_env(_env, "HSA_TOOLS_LIB", _dl_libpath);
+    update_env(_env, "HSA_TOOLS_REPORT_LOAD_FAILURE", "1");
+#endif
+
+#if defined(OMNITRACE_USE_ROCPROFILER)
+    update_env(_env, "ROCP_TOOL_LIB", _omni_libpath);
+    update_env(_env, "ROCP_HSA_INTERCEPT", "1");
+#endif
+
+#if defined(OMNITRACE_USE_OMPT)
+    update_env(_env, "OMP_TOOL_LIBRARIES", _dl_libpath);
+#endif
+
+    // _dl_libpath and _omni_libpath are std::string values: nothing to free here and
+    // no nullptr can reach update_env (get_realpath releases the realpath() buffer)
+
+    return _env;
+}
+// Path to _lib under ../lib relative to the directory containing /proc/self/exe
+std::string
+get_internal_libpath(const std::string& _lib)
+{
+    auto _exe = get_realpath("/proc/self/exe");  // owning string: no leaked buffer, no nullptr view
+    auto _pos = _exe.find_last_of('/');
+    auto _dir = std::string{ "./" };
+    if(_pos != std::string::npos) _dir = _exe.substr(0, _pos);
+    return omnitrace::common::join("/", _dir, "..", "lib", _lib);
+}
+// Set NAME=value in _environ; with _append the new value is placed before the existing one, ':'-separated
+template
+void
+update_env(std::vector& _environ, std::string_view _env_var, Tp&& _env_val,
+           bool _append)
+{
+    auto _key = join("", _env_var, "=");
+    for(auto& itr : _environ)
+    {
+        if(!itr) continue;
+        if(std::string_view{ itr }.find(_key) == 0)  // entry starts with "NAME="
+        {
+            if(_append)
+            {
+                auto _val = std::string{ itr }.substr(_key.length());
+                free(itr);
+                itr = strdup(
+                    omnitrace::common::join('=', _env_var, join(":", _env_val, _val))
+                        .c_str());
+            }
+            else
+            {
+                free(itr);
+                itr = strdup(omnitrace::common::join('=', _env_var, _env_val).c_str());
+            }
+            return;
+        }
+    }
+    _environ.emplace_back(
+        strdup(omnitrace::common::join('=', _env_var, _env_val).c_str()));
+}
+// Register the CLI options, translate them into _env entries, return the command from parse_known_args
+std::vector
+parse_args(int argc, char** argv, std::vector& _env)
+{
+    using parser_t = tim::argparse::argument_parser;
+    using parser_err_t = typename parser_t::result_type;
+
+    auto help_check = [](parser_t& p, int _argc, char** _argv) {
+        std::set help_args = { "-h", "--help", "-?" };
+        return (p.exists("help") || _argc == 1 ||
+                (_argc > 1 && help_args.find(_argv[1]) != help_args.end()));
+    };
+
+    auto _pec = EXIT_SUCCESS;
+    auto help_action = [&_pec, argc, argv](parser_t& p) {
+        if(_pec != EXIT_SUCCESS)
+        {
+            std::stringstream msg;
+            msg << "Error in command:";
+            for(int i = 0; i < argc; ++i)
+                msg << " " << argv[i];
+            msg << "\n\n";
+            stream(std::cerr, color::fatal()) << msg.str();
+            std::cerr << std::flush;
+        }
+
+        p.print_help();
+        exit(_pec);
+    };
+
+    auto parser = parser_t(argv[0]);
+
+    parser.on_error([](parser_t&, const parser_err_t& _err) {
+        stream(std::cerr, color::fatal()) << _err << "\n";
+        exit(EXIT_FAILURE);
+    });
+
+    const auto* _cputime_desc =
+        R"(Sample based on a CPU-clock timer (default). Accepts zero or more arguments:
+ %{INDENT}%0. Enables sampling based on CPU-clock timer.
+ %{INDENT}%1. Interrupts per second. E.g., 100 == sample every 10 milliseconds of CPU-time.
+ %{INDENT}%2. Delay (in seconds of CPU-clock time). I.e., how long each thread should wait before taking first sample.
+ %{INDENT}%3+ Thread IDs to target for sampling, starting at 0 (the main thread).
+ %{INDENT}% May be specified as index or range, e.g., '0 2-4' will be interpreted as:
+ %{INDENT}% sample the main thread (0), do not sample the first child thread but sample the 2nd, 3rd, and 4th child threads)";
+
+    const auto* _realtime_desc =
+        R"(Sample based on a real-clock timer. Accepts zero or more arguments:
+ %{INDENT}%0. Enables sampling based on real-clock timer.
+ %{INDENT}%1. Interrupts per second. E.g., 100 == sample every 10 milliseconds of realtime.
+ %{INDENT}%2. Delay (in seconds of real-clock time). I.e., how long each thread should wait before taking first sample.
+ %{INDENT}%3+ Thread IDs to target for sampling, starting at 0 (the main thread).
+ %{INDENT}% May be specified as index or range, e.g., '0 2-4' will be interpreted as:
+ %{INDENT}% sample the main thread (0), do not sample the first child thread but sample the 2nd, 3rd, and 4th child threads
+ %{INDENT}% When sampling with a real-clock timer, please note that enabling this will cause threads which are typically "idle"
+ %{INDENT}% to consume more resources since, while idle, the real-clock time increases (and therefore triggers taking samples)
+ %{INDENT}% whereas the CPU-clock time does not.)";
+
+    const auto* _hsa_interrupt_desc =
+        R"(Set the value of the HSA_ENABLE_INTERRUPT environment variable.
+%{INDENT}% ROCm version 5.2 and older have a bug which will cause a deadlock if a sample is taken while waiting for the signal
+%{INDENT}% that a kernel completed -- which happens when sampling with a real-clock timer. We require this option to be set
+%{INDENT}% when --realtime is specified to make users aware that, while this may fix the bug, it can have a negative impact on
+%{INDENT}% performance.
+%{INDENT}% Values:
+%{INDENT}% 0 avoid triggering the bug, potentially at the cost of reduced performance
+%{INDENT}% 1 do not modify how ROCm is notified about kernel completion)";
+
+    auto _realtime_reqs = (get_env("HSA_ENABLE_INTERRUPT", std::string{}, false).empty())
+                              ? std::initializer_list{ "hsa-interrupt" }
+                              : std::initializer_list{};
+
+    const auto* _trace_policy_desc =
+        R"(Policy for new data when the buffer size limit is reached:
+ %{INDENT}%- discard : new data is ignored
+ %{INDENT}%- ring_buffer : new data overwrites oldest data)";
+
+    auto _add_separator = [&](std::string _v, const std::string& _desc) {
+        parser.add_argument({ "" }, "");
+        parser
+            .add_argument({ join("", "[", _v, "]") },
+                          (_desc.empty()) ? _desc : join({ "", "(", ")" }, _desc))
+            .color(tim::log::color::info());
+        parser.add_argument({ "" }, "");
+    };
+
+    parser.enable_help();
+
+    auto _cols = std::get<0>(tim::utility::console::get_columns());
+    if(_cols > parser.get_help_width() + 8)
+        parser.set_description_width(
+            std::min(_cols - parser.get_help_width() - 8, 120));
+
+    _add_separator("DEBUG OPTIONS", "");
+    parser.add_argument({ "--monochrome" }, "Disable colorized output")
+        .max_count(1)
+        .dtype("bool")
+        .action([&](parser_t& p) {
+            auto _colorized = !p.get("monochrome");
+            p.set_use_color(_colorized);
+            update_env(_env, "OMNITRACE_COLORIZED_LOG", (_colorized) ? "1" : "0");
+            update_env(_env, "COLORIZED_LOG", (_colorized) ? "1" : "0");
+        });
+    parser.add_argument({ "--debug" }, "Debug output")
+        .max_count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_DEBUG", p.get("debug"));
+        });
+    parser.add_argument({ "-v", "--verbose" }, "Verbose output")
+        .count(1)
+        .action([&](parser_t& p) {
+            auto _v = p.get("verbose");
+            verbose = _v;
+            update_env(_env, "OMNITRACE_VERBOSE", _v);
+        });
+
+    _add_separator("GENERAL OPTIONS", "");
+    parser.add_argument({ "-c", "--config" }, "Configuration file")
+        .min_count(0)
+        .dtype("filepath")
+        .action([&](parser_t& p) {
+            update_env(
+                _env, "OMNITRACE_CONFIG_FILE",
+                join(array_config{ ":" }, p.get>("config")));
+        });
+    parser
+        .add_argument({ "-o", "--output" },
+                      "Output path. Accepts 1-2 parameters corresponding to the output "
+                      "path and the output prefix")
+        .min_count(1)
+        .max_count(2)
+        .action([&](parser_t& p) {
+            auto _v = p.get>("output");
+            update_env(_env, "OMNITRACE_OUTPUT_PATH", _v.at(0));
+            if(_v.size() > 1) update_env(_env, "OMNITRACE_OUTPUT_PREFIX", _v.at(1));
+        });
+    parser
+        .add_argument({ "-T", "--trace" }, "Generate a detailed trace (perfetto output)")
+        .max_count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_USE_PERFETTO", p.get("trace"));
+        });
+    parser
+        .add_argument(
+            { "-P", "--profile" },
+            "Generate a call-stack-based profile (conflicts with --flat-profile)")
+        .max_count(1)
+        .conflicts({ "flat-profile" })
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_USE_TIMEMORY", p.get("profile"));
+        });
+    parser
+        .add_argument({ "-F", "--flat-profile" },
+                      "Generate a flat profile (conflicts with --profile)")
+        .max_count(1)
+        .conflicts({ "profile" })
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_USE_TIMEMORY", p.get("flat-profile"));
+            update_env(_env, "OMNITRACE_FLAT_PROFILE", p.get("flat-profile"));
+        });
+    parser
+        .add_argument({ "-H", "--host" },
+                      "Enable sampling host-based metrics for the process. E.g. CPU "
+                      "frequency, memory usage, etc.")
+        .max_count(1)
+        .action([&](parser_t& p) {
+            auto _h = p.get("host");
+            auto _d = p.get("device");
+            update_env(_env, "OMNITRACE_USE_PROCESS_SAMPLING", _h || _d);
+            update_env(_env, "OMNITRACE_CPU_FREQ_ENABLED", _h);
+        });
+    parser
+        .add_argument({ "-D", "--device" },
+                      "Enable sampling device-based metrics for the process. E.g. GPU "
+                      "temperature, memory usage, etc.")
+        .max_count(1)
+        .action([&](parser_t& p) {
+            auto _h = p.get("host");
+            auto _d = p.get("device");
+            update_env(_env, "OMNITRACE_USE_PROCESS_SAMPLING", _h || _d);
+            update_env(_env, "OMNITRACE_USE_ROCM_SMI", _d);
+        });
+
+    _add_separator("TRACING OPTIONS", "");
+    parser
+        .add_argument({ "--trace-file" },
+                      "Specify the trace output filename. Relative filepath will be with "
+                      "respect to output path and output prefix.")
+        .count(1)
+        .dtype("filepath")
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_PERFETTO_FILE", p.get("trace-file"));
+        });
+    parser
+        .add_argument({ "--trace-buffer-size" },
+                      "Size limit for the trace output (in KB)")
+        .count(1)
+        .dtype("KB")
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_PERFETTO_BUFFER_SIZE_KB",
+                       p.get("trace-buffer-size"));
+        });
+    parser.add_argument({ "--trace-fill-policy" }, _trace_policy_desc)
+        .count(1)
+        .choices({ "discard", "ring_buffer" })
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_PERFETTO_FILL_POLICY",
+                       p.get("trace-fill-policy"));
+        });
+
+    _add_separator("PROFILE OPTIONS", "");
+    parser.add_argument({ "--profile-format" }, "Data formats for profiling results")
+        .min_count(1)
+        .max_count(3)
+        .requires({ "profile|flat-profile" })
+        .choices({ "text", "json", "console" })
+        .action([&](parser_t& p) {
+            auto _v = p.get>("profile-format");  // read this option's own values, not the --profile flag
+            update_env(_env, "OMNITRACE_USE_TIMEMORY", true);
+            if(!_v.empty())
+            {
+                update_env(_env, "OMNITRACE_TEXT_OUTPUT", _v.count("text") != 0);
+                update_env(_env, "OMNITRACE_JSON_OUTPUT", _v.count("json") != 0);
+                update_env(_env, "OMNITRACE_COUT_OUTPUT", _v.count("console") != 0);
+            }
+        });
+
+    parser
+        .add_argument({ "--profile-diff" },
+                      "Generate a diff output b/t the profile collected and an existing "
+                      "profile from another run Accepts 1-2 parameters corresponding to "
+                      "the input path and the input prefix")
+        .min_count(1)
+        .max_count(2)
+        .action([&](parser_t& p) {
+            auto _v = p.get>("profile-diff");
+            update_env(_env, "OMNITRACE_DIFF_OUTPUT", true);
+            update_env(_env, "OMNITRACE_INPUT_PATH", _v.at(0));
+            if(_v.size() > 1) update_env(_env, "OMNITRACE_INPUT_PREFIX", _v.at(1));
+        });
+
+    _add_separator("HOST/DEVICE (PROCESS SAMPLING) OPTIONS", "");
+    parser
+        .add_argument({ "--process-freq" },
+                      "Set the default host/device sampling frequency "
+                      "(number of interrupts per second)")
+        .count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_PROCESS_SAMPLING_FREQ",
+                       p.get("process-freq"));
+        });
+    parser
+        .add_argument({ "--process-wait" }, "Set the default wait time (i.e. delay) "
+                                            "before taking first host/device sample "
+                                            "(in seconds of realtime)")
+        .count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_PROCESS_SAMPLING_DELAY",
+                       p.get("process-wait"));
+        });
+    parser
+        .add_argument(
+            { "--process-duration" },
+            "Set the duration of the host/device sampling (in seconds of realtime)")
+        .count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_PROCESS_SAMPLING_DURATION",  // same prefix as the sibling --process-* options
+                       p.get("process-duration"));
+        });
+    parser
+        .add_argument({ "--cpus" },
+                      "CPU IDs for frequency sampling. Supports integers and/or ranges")
+        .dtype("int or range")
+        .requires({ "host" })
+        .action([&](parser_t& p) {
+            update_env(
+                _env, "OMNITRACE_PROCESS_SAMPLING_CPUS",
+                join(array_config{ "," }, p.get>("cpus")));
+        });
+    parser
+        .add_argument({ "--gpus" },
+                      "GPU IDs for SMI queries. Supports integers and/or ranges")
+        .dtype("int or range")
+        .requires({ "device" })
+        .action([&](parser_t& p) {
+            update_env(
+                _env, "OMNITRACE_PROCESS_SAMPLING_GPUS",
+                join(array_config{ "," }, p.get>("gpus")));
+        });
+
+    _add_separator("GENERAL SAMPLING OPTIONS", "");
+    parser
+        .add_argument({ "-f", "--freq" }, "Set the default sampling frequency "
+                                          "(number of interrupts per second)")
+        .count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_SAMPLING_FREQ", p.get("freq"));
+        });
+    parser
+        .add_argument(
+            { "-w", "--wait" },
+            "Set the default wait time (i.e. delay) before taking first sample "
+            "(in seconds). This delay time is based on the clock of the sampler, i.e., a "
+            "delay of 1 second for CPU-clock sampler may not equal 1 second of realtime")
+        .count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_SAMPLING_DELAY", p.get("wait"));
+        });
+    parser
+        .add_argument(
+            { "-d", "--duration" },
+            "Set the duration of the sampling (in seconds of realtime). I.e., it is "
+            "possible (currently) to set a CPU-clock time delay that exceeds the "
+            "real-time duration... resulting in zero samples being taken")
+        .count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_SAMPLING_DURATION", p.get("duration"));
+        });
+    parser
+        .add_argument({ "-t", "--tids" },
+                      "Specify the default thread IDs for sampling, where 0 (zero) is "
+                      "the main thread and each thread created by the target application "
+                      "is assigned an atomically incrementing value.")
+        .min_count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_SAMPLING_TIDS",
+                       join(array_config{ ", " }, p.get>("tids")));
+        });
+
+    _add_separator("SAMPLING TIMER OPTIONS", "");
+    parser.add_argument({ "--cputime" }, _cputime_desc)
+        .min_count(0)
+        .action([&](parser_t& p) {
+            auto _v = p.get>("cputime");  // positional values: [freq [delay [tids...]]]
+            update_env(_env, "OMNITRACE_SAMPLING_CPUTIME", true);
+            if(!_v.empty())
+            {
+                update_env(_env, "OMNITRACE_SAMPLING_CPUTIME_FREQ", _v.front());
+                _v.pop_front();
+            }
+            if(!_v.empty())
+            {
+                update_env(_env, "OMNITRACE_SAMPLING_CPUTIME_DELAY", _v.front());
+                _v.pop_front();
+            }
+            if(!_v.empty())
+            {
+                update_env(_env, "OMNITRACE_SAMPLING_CPUTIME_TIDS",
+                           join(array_config{ "," }, _v));
+            }
+        });
+
+    parser.add_argument({ "--realtime" }, _realtime_desc)
+        .min_count(0)
+        .requires(_realtime_reqs)
+        .action([&](parser_t& p) {
+            auto _v = p.get>("realtime");  // positional values: [freq [delay [tids...]]]
+            update_env(_env, "OMNITRACE_SAMPLING_REALTIME", true);
+            if(!_v.empty())
+            {
+                update_env(_env, "OMNITRACE_SAMPLING_REALTIME_FREQ", _v.front());
+                _v.pop_front();
+            }
+            if(!_v.empty())
+            {
+                update_env(_env, "OMNITRACE_SAMPLING_REALTIME_DELAY", _v.front());
+                _v.pop_front();
+            }
+            if(!_v.empty())
+            {
+                update_env(_env, "OMNITRACE_SAMPLING_REALTIME_TIDS",
+                           join(array_config{ "," }, _v));
+            }
+        });
+
+    std::set _backend_choices = { "all", "kokkosp", "mpip",
+                                  "ompt", "rcclp", "rocm-smi",
+                                  "roctracer", "rocprofiler", "roctx",
+                                  "mutex-locks", "spin-locks", "rw-locks" };
+
+#if !defined(OMNITRACE_USE_MPI) && !defined(OMNITRACE_USE_MPI_HEADERS)
+    _backend_choices.erase("mpip");
+#endif
+
+#if !defined(OMNITRACE_USE_OMPT)
+    _backend_choices.erase("ompt");
+#endif
+
+#if !defined(OMNITRACE_USE_RCCL)
+    _backend_choices.erase("rcclp");
+#endif
+
+#if !defined(OMNITRACE_USE_ROCM_SMI)
+    _backend_choices.erase("rocm-smi");
+#endif
+
+#if !defined(OMNITRACE_USE_ROCTRACER)
+    _backend_choices.erase("roctracer");
+    _backend_choices.erase("roctx");
+#endif
+
+#if !defined(OMNITRACE_USE_ROCPROFILER)
+    _backend_choices.erase("rocprofiler");
+#endif
+
+    _add_separator("BACKEND OPTIONS", "These options control region information captured "
+                                      "w/o sampling or instrumentation");
+    parser.add_argument({ "-I", "--include" }, "Include data from these backends")
+        .choices(_backend_choices)
+        .action([&](parser_t& p) {
+            auto _v = p.get>("include");
+            auto _update = [&](const auto& _opt, bool _cond) {
+                if(_cond || _v.count("all") > 0) update_env(_env, _opt, true);
+            };
+            _update("OMNITRACE_USE_KOKKOSP", _v.count("kokkosp") > 0);
+            _update("OMNITRACE_USE_MPIP", _v.count("mpip") > 0);
+            _update("OMNITRACE_USE_OMPT", _v.count("ompt") > 0);
+            _update("OMNITRACE_USE_RCCLP", _v.count("rcclp") > 0);
+            _update("OMNITRACE_USE_ROCTX", _v.count("roctx") > 0);
+            _update("OMNITRACE_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
+            _update("OMNITRACE_USE_ROCTRACER", _v.count("roctracer") > 0);
+            _update("OMNITRACE_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
+            _update("OMNITRACE_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
+            _update("OMNITRACE_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
+            _update("OMNITRACE_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
+        });
+
+    parser.add_argument({ "-E", "--exclude" }, "Exclude data from these backends")
+        .choices(_backend_choices)
+        .action([&](parser_t& p) {
+            auto _v = p.get>("exclude");
+            auto _update = [&](const auto& _opt, bool _cond) {
+                if(_cond || _v.count("all") > 0) update_env(_env, _opt, false);
+            };
+            _update("OMNITRACE_USE_KOKKOSP", _v.count("kokkosp") > 0);
+            _update("OMNITRACE_USE_MPIP", _v.count("mpip") > 0);
+            _update("OMNITRACE_USE_OMPT", _v.count("ompt") > 0);
+            _update("OMNITRACE_USE_RCCLP", _v.count("rcclp") > 0);
+            _update("OMNITRACE_USE_ROCTX", _v.count("roctx") > 0);
+            _update("OMNITRACE_USE_ROCM_SMI", _v.count("rocm-smi") > 0);
+            _update("OMNITRACE_USE_ROCTRACER", _v.count("roctracer") > 0);
+            _update("OMNITRACE_USE_ROCPROFILER", _v.count("rocprofiler") > 0);
+            _update("OMNITRACE_TRACE_THREAD_LOCKS", _v.count("mutex-locks") > 0);
+            _update("OMNITRACE_TRACE_THREAD_RW_LOCKS", _v.count("rw-locks") > 0);
+            _update("OMNITRACE_TRACE_THREAD_SPIN_LOCKS", _v.count("spin-locks") > 0);
+        });
+
+    _add_separator("HARDWARE COUNTER OPTIONS", "");
+    parser
+        .add_argument({ "-C", "--cpu-events" },
+                      "Set the CPU hardware counter events to record (ref: "
+                      "`omnitrace-avail -H -c CPU`)")
+        .set_default(std::set{})
+        .action([&](parser_t& p) {
+            auto _events =
+                join(array_config{ "," }, p.get>("cpu-events"));
+            update_env(_env, "OMNITRACE_PAPI_EVENTS", _events);
+        });
+
+#if defined(OMNITRACE_USE_ROCPROFILER)
+    parser
+        .add_argument({ "-G", "--gpu-events" },
+                      "Set the GPU hardware counter events to record (ref: "
+                      "`omnitrace-avail -H -c GPU`)")
+        .set_default(std::set{})
+        .action([&](parser_t& p) {
+            auto _events =
+                join(array_config{ "," }, p.get>("gpu-events"));
+            update_env(_env, "OMNITRACE_ROCM_EVENTS", _events);
+        });
+#endif
+
+    _add_separator("MISCELLANEOUS OPTIONS", "");
+    parser
+        .add_argument({ "-i", "--inlines" },
+                      "Include inline info in output when available")
+        .max_count(1)
+        .action([&](parser_t& p) {
+            update_env(_env, "OMNITRACE_SAMPLING_INCLUDE_INLINES",
+                       p.get("inlines"));
+        });
+
+    parser.add_argument({ "--hsa-interrupt" }, _hsa_interrupt_desc)
+        .count(1)
+        .dtype("int")
+        .choices({ 0, 1 })
+        .action([&](parser_t& p) {
+            update_env(_env, "HSA_ENABLE_INTERRUPT", p.get("hsa-interrupt"));
+        });
+
+    auto _args = parser.parse_known_args(argc, argv);
+    auto _cerr = std::get<0>(_args);
+    auto _cmdc = std::get<1>(_args);
+    auto* _cmdv = std::get<2>(_args);
+
+    if(parser.exists("realtime") && !parser.exists("cputime"))
+        update_env(_env, "OMNITRACE_SAMPLING_CPUTIME", false);
+    if(parser.exists("profile") && parser.exists("flat-profile"))
+        throw std::runtime_error(
+            "Error! '--profile' argument conflicts with '--flat-profile' argument");
+
+    if(help_check(parser, _cmdc, _cmdv)) help_action(parser);
+
+    if(_cerr) throw std::runtime_error(_cerr.what());
+
+    std::vector _argv = {};
+    _argv.reserve(_cmdc);
+    for(int i = 1; i < _cmdc; ++i)  // entry 0 of _cmdv is skipped
+        _argv.emplace_back(_cmdv[i]);
+
+    return _argv;
+}
diff --git a/source/bin/omnitrace-sample/omnitrace-sample.cpp b/source/bin/omnitrace-sample/omnitrace-sample.cpp
new file mode 100644
index 000000000..46ec2b691
--- /dev/null
+++ b/source/bin/omnitrace-sample/omnitrace-sample.cpp
@@ -0,0 +1,75 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "omnitrace-sample.hpp"
+
+#include
+#include
+#include
+#include
+// Build the child environment, optionally parse omnitrace-sample options, then execve the command
+int
+main(int argc, char** argv)
+{
+    auto _env = get_initial_environment();
+
+    bool _has_double_hyphen = false;  // also set by the help flags so parse_args can print help
+    for(int i = 1; i < argc; ++i)
+    {
+        auto _arg = std::string_view{ argv[i] };
+        if(_arg == "--" || _arg == "-?" || _arg == "-h" || _arg == "--help")
+            _has_double_hyphen = true;
+    }
+
+    std::vector _argv = {};
+    if(_has_double_hyphen)
+    {
+        // parse_args receives _env by reference and records the selected options in it
+        _argv = parse_args(argc, argv, _env);
+    }
+    else
+    {
+        _argv.reserve(argc);
+        for(int i = 1; i < argc; ++i)
+            _argv.emplace_back(argv[i]);
+    }
+
+    std::sort(_env.begin(), _env.end(), [](auto* _lhs, auto* _rhs) {
+        if(!_lhs) return false;
+        if(!_rhs) return true;
+        return std::string_view{ _lhs } < std::string_view{ _rhs };
+    });
+
+    for(auto* itr : _env)
+        if(itr != nullptr && std::string_view{ itr }.find("OMNITRACE") == 0)
+            std::cout << itr << "\n";
+
+    if(!_argv.empty())
+    {
+        std::string _argv0 = get_command(_argv[0]);
+        print_command(_argv);
+        _argv.emplace_back(nullptr);  // execve requires null-terminated argv and envp arrays
+        _env.emplace_back(nullptr);
+
+        return execve(_argv0.c_str(), _argv.data(), _env.data());  // execve only returns on failure
+    }
+}
diff --git a/source/bin/omnitrace-sample/omnitrace-sample.hpp b/source/bin/omnitrace-sample/omnitrace-sample.hpp
new file mode 100644
index 000000000..8581e36a4
--- /dev/null
+++ b/source/bin/omnitrace-sample/omnitrace-sample.hpp
@@ -0,0 +1,49 @@
+// MIT License
+//
+// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include +#include +#include + +std::string +get_command(const char*); + +std::string +get_realpath(const std::string&); + +void +print_command(const std::vector& _argv); + +std::vector +get_initial_environment(); + +std::string +get_internal_libpath(const std::string& _lib); + +template +void +update_env(std::vector&, std::string_view, Tp&&, bool _append = false); + +std::vector +parse_args(int argc, char** argv, std::vector&); diff --git a/source/docs/about.md b/source/docs/about.md index 771c76928..acb475c01 100644 --- a/source/docs/about.md +++ b/source/docs/about.md @@ -6,25 +6,48 @@ :maxdepth: 4 ``` +## Overview + > ***[Omnitrace](https://github.com/AMDResearch/omnitrace) is an AMD open source research project and is not supported as part of the ROCm software stack.*** [Browse Omnitrace source code on Github](https://github.com/AMDResearch/omnitrace) -[Omnitrace](https://github.com/AMDResearch/omnitrace) is designed for both high-level and -comprehensive application tracing and profiling on both the CPU and GPU. -[Omnitrace](https://github.com/AMDResearch/omnitrace) supports both binary instrumentation -and sampling as a means of collecting various metrics. +[Omnitrace](https://github.com/AMDResearch/omnitrace) is designed for both high-level profiling and +comprehensive tracing of applications running on the CPU or the CPU+GPU via dynamic binary instrumentation, +call-stack sampling, and various other means for determining currently executing function and line information. -Visualization of the comprehensive omnitrace results can be viewed in any modern web browser by visiting [ui.perfetto.dev](https://ui.perfetto.dev/) -and loading the perfetto output (`.proto` files) produced by omnitrace. +Visualization of the comprehensive omnitrace results can be viewed in any modern web browser by visiting +[ui.perfetto.dev](https://ui.perfetto.dev/) and loading the perfetto output (`.proto` files) produced by omnitrace. 
Aggregated high-level results are available in text files for human consumption and JSON files for programmatic analysis. The JSON output files are compatible with the python package [hatchet](https://github.com/hatchet/hatchet) which converts -the performance data into pandas dataframes and facilitate multi-run comparisons, filtering, visualization in Jupyter notebooks, and much more. +the performance data into pandas dataframes and facilitates multi-run comparisons, filtering, visualization in Jupyter notebooks, +and much more. -[Omnitrace](https://github.com/AMDResearch/omnitrace) has two distinct configuration steps: +[Omnitrace](https://github.com/AMDResearch/omnitrace) has two distinct configuration steps when instrumenting: 1. Configuring which functions and modules are instrumented in the target binaries (i.e. executable and/or libraries) - [Instrumenting with Omnitrace](instrumenting.md) 2. Configuring what the instrumentation does happens when the instrumented binaries are executed - [Customizing Omnitrace Runtime](runtime.md) + +## Omnitrace Use Cases + +When analyzing the performance of an application, ***it is always best to NOT assume you know where the performance bottlenecks are*** +***and why they are happening.*** Omnitrace is a ***tool for the entire execution of an application***. It is the sort of tool which is +ideal for *characterizing* where optimization would have the greatest impact on the end-to-end execution of the application and/or +viewing what else is happening on the system during a performance bottleneck. + +Especially when GPUs are involved, there is a tendency to assume that the quickest path to performance improvement is minimizing +the runtime of the GPU kernels. 
This is a highly flawed assumption: if you optimize the runtime of a kernel from 1 millisecond +to 1 microsecond (1000x speed-up) but the original application *never spent time waiting* for kernel(s) to complete, +you will see zero statistically significant speed-up in the end-to-end runtime of your application. In other words, it does not matter +how fast or slow the code on the GPU is if the application is not bottlenecked waiting on the GPU. + +Use Omnitrace to obtain a high-level view of the entire application. Use it to determine where the performance bottlenecks are and +obtain clues as to why these bottlenecks are happening. If you want ***extensive*** insight into the execution of individual kernels +on the GPU, AMD Research is working on another tool for this, but you should start with the tool which characterizes the +broad picture: Omnitrace. + +With regard to the CPU, Omnitrace does not target any specific vendor; it works just as well with non-AMD CPUs as with AMD CPUs. +With regard to the GPU, Omnitrace is currently restricted to the HIP and HSA APIs and kernels executing on AMD GPUs. 
diff --git a/source/docs/runtime.md b/source/docs/runtime.md index 84795fabe..71ce68185 100644 --- a/source/docs/runtime.md +++ b/source/docs/runtime.md @@ -248,7 +248,7 @@ OMNITRACE_STRICT_CONFIG = true OMNITRACE_SUPPRESS_CONFIG = true OMNITRACE_SUPPRESS_PARSING = true OMNITRACE_TEXT_OUTPUT = true -OMNITRACE_TIME_FORMAT = %F_%I.%M_%p +OMNITRACE_TIME_FORMAT = %F_%H.%M OMNITRACE_TIMELINE_PROFILE = false OMNITRACE_TIMING_PRECISION = 6 OMNITRACE_TIMING_SCIENTIFIC = false diff --git a/source/lib/omnitrace-dl/dl.cpp b/source/lib/omnitrace-dl/dl.cpp index 23fb06abc..d901b42cd 100644 --- a/source/lib/omnitrace-dl/dl.cpp +++ b/source/lib/omnitrace-dl/dl.cpp @@ -98,9 +98,9 @@ get_omnitrace_dl_env() inline bool get_omnitrace_preload() { - auto&& _preload = get_env("OMNITRACE_PRELOAD", false); + auto&& _preload = get_env("OMNITRACE_PRELOAD", true); auto&& _preload_libs = get_env("LD_PRELOAD", std::string{}); - return (_preload || _preload_libs.find("libomnitrace-dl.so") != std::string::npos); + return (_preload && _preload_libs.find("libomnitrace-dl.so") != std::string::npos); } // environment priority: @@ -381,8 +381,8 @@ struct OMNITRACE_HIDDEN_API indirect // ROCP functions #if OMNITRACE_USE_ROCPROFILER > 0 - void (*rocp_on_load_tool_prop_f)(rocprofiler_settings* settings) = nullptr; - void (*rocp_on_unload_tool_f)() = nullptr; + void (*rocp_on_load_tool_prop_f)(void* settings) = nullptr; + void (*rocp_on_unload_tool_f)() = nullptr; #endif // OpenMP functions @@ -408,6 +408,8 @@ get_indirect() OMNITRACE_HIDDEN_API; indirect& get_indirect() { + omnitrace_preinit_library(); + static auto _libomni = get_env("OMNITRACE_LIBRARY", "libomnitrace.so"); static auto _libuser = get_env("OMNITRACE_USER_LIBRARY", "libomnitrace-user.so"); static auto _libdlib = get_env("OMNITRACE_DL_LIBRARY", "libomnitrace-dl.so"); @@ -503,6 +505,17 @@ namespace dl = omnitrace::dl; extern "C" { + void omnitrace_preinit_library(void) + { + if(!omnitrace::common::get_env("OMNITRACE_COLORIZED_LOG", 
tim::log::colorized())) + tim::log::colorized() = false; + } + + int omnitrace_preload_library(void) + { + return (::omnitrace::dl::get_omnitrace_preload()) ? 1 : 0; + } + void omnitrace_init_library(void) { OMNITRACE_DL_INVOKE(get_indirect().omnitrace_init_library_f); @@ -873,9 +886,15 @@ extern "C" // //----------------------------------------------------------------------------------// -#if OMNITRACE_USE_ROCTRACER > 0 - void OnLoadToolProp(rocprofiler_settings* settings) - { +#if OMNITRACE_USE_ROCPROFILER > 0 + void OnLoadToolProp(void* settings) + { + OMNITRACE_DL_LOG(-16, + "invoking %s(rocprofiler_settings_t*) within omnitrace-dl.so " + "will cause a silent failure for rocprofiler. ROCP_TOOL_LIB " + "should be set to libomnitrace.so\n", + __FUNCTION__); + abort(); return OMNITRACE_DL_INVOKE(get_indirect().rocp_on_load_tool_prop_f, settings); } @@ -913,16 +932,16 @@ omnitrace_preload() OMNITRACE_HIDDEN_API; bool omnitrace_preload() { - auto _preloaded = get_omnitrace_preload(); - auto _enabled = get_env("OMNITRACE_ENABLED", true); + auto _preload = get_omnitrace_preload() && get_env("OMNITRACE_ENABLED", true); static bool _once = false; - if(_once) return _preloaded; + if(_once) return _preload; _once = true; - if(_preloaded && _enabled) + if(_preload) { - OMNITRACE_DL_LOG(0, "[%s] invoking %s(%s)\n", __FUNCTION__, "omnitrace_init", + omnitrace_preinit_library(); + OMNITRACE_DL_LOG(1, "[%s] invoking %s(%s)\n", __FUNCTION__, "omnitrace_init", ::omnitrace::join(::omnitrace::QuoteStrings{}, ", ", "sampling", false, "main") .c_str()); @@ -930,7 +949,7 @@ omnitrace_preload() omnitrace_init_tooling(); } - return _preloaded; + return _preload; } bool _handle_preload = omnitrace::dl::omnitrace_preload(); diff --git a/source/lib/omnitrace-dl/dl.hpp b/source/lib/omnitrace-dl/dl.hpp index 373a94b4f..c3e4b5aa3 100644 --- a/source/lib/omnitrace-dl/dl.hpp +++ b/source/lib/omnitrace-dl/dl.hpp @@ -87,6 +87,9 @@ extern "C" size_t address) OMNITRACE_PUBLIC_API; #if 
defined(OMNITRACE_DL_SOURCE) && (OMNITRACE_DL_SOURCE > 0) + void omnitrace_preinit_library(void) OMNITRACE_HIDDEN_API; + int omnitrace_preload_library(void) OMNITRACE_HIDDEN_API; + int omnitrace_user_start_trace_dl(void) OMNITRACE_HIDDEN_API; int omnitrace_user_stop_trace_dl(void) OMNITRACE_HIDDEN_API; @@ -164,8 +167,7 @@ extern "C" # if OMNITRACE_USE_ROCPROFILER > 0 // ROCP - struct rocprofiler_settings; - void OnLoadToolProp(rocprofiler_settings* settings) OMNITRACE_PUBLIC_API; + void OnLoadToolProp(void* settings) OMNITRACE_PUBLIC_API; void OnUnloadTool() OMNITRACE_PUBLIC_API; # endif #endif diff --git a/source/lib/omnitrace-dl/main.c b/source/lib/omnitrace-dl/main.c index 0ded1bfb1..c82ebe8cb 100644 --- a/source/lib/omnitrace-dl/main.c +++ b/source/lib/omnitrace-dl/main.c @@ -29,6 +29,9 @@ #include #include +extern int +omnitrace_preload_library(void); + extern void omnitrace_finalize(void); @@ -84,6 +87,8 @@ __libc_start_main(int (*_main)(int, char**, char**), int _argc, char** _argv, int (*_init)(int, char**, char**), void (*_fini)(void), void (*_rtld_fini)(void), void* _stack_end) { + int _preload = omnitrace_preload_library(); + // prevent re-entry static int _reentry = 0; if(_reentry > 0) return -1; @@ -98,14 +103,23 @@ __libc_start_main(int (*_main)(int, char**, char**), int _argc, char** _argv, // Find the real __libc_start_main() omnitrace_libc_start_main user_main = dlsym(RTLD_NEXT, "__libc_start_main"); + // disable future LD_PRELOADs + setenv("OMNITRACE_PRELOAD", "0", 1); + if(user_main && user_main != _this_func) { - //if(strcmp(_argv[0], "mpirun") == 0) - // return user_main(_main, _argc, _argv, _init, _fini, _rtld_fini, - // _stack_end); - //else + if(_preload == 0) + { + // call original main + return user_main(main_real, _argc, _argv, _init, _fini, _rtld_fini, + _stack_end); + } + else + { // call omnitrace main function wrapper - return user_main(omnitrace_main, _argc, _argv, _init, _fini, _rtld_fini, _stack_end); + return 
user_main(omnitrace_main, _argc, _argv, _init, _fini, _rtld_fini, + _stack_end); + } } else { diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index 149721dd4..e739b058a 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -20,6 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +// clang-format off +#include +// clang-format on + #include "api.hpp" #include "common/setup.hpp" #include "library/components/category_region.hpp" @@ -331,6 +335,8 @@ omnitrace_init_library_hidden() extern "C" bool omnitrace_init_tooling_hidden() { + if(!get_env("OMNITRACE_COLORIZED_LOG", true, false)) tim::log::colorized() = false; + if(!tim::get_env("OMNITRACE_INIT_TOOLING", true)) { omnitrace_init_library_hidden(); @@ -604,8 +610,14 @@ omnitrace_finalize_hidden(void) auto& _thread_bundle = thread_data::instance(); if(_thread_bundle) _thread_bundle->stop(); - if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n"); - if(get_verbose() > 0 || get_debug()) config::print_settings(); + if(get_verbose() >= 1 || get_debug()) + { + if(dmp::rank() == 0) + { + fprintf(stderr, "\n"); + config::print_settings(); + } + } OMNITRACE_VERBOSE_F(1, "omnitrace_push_trace :: called %zux\n", _push_count); OMNITRACE_VERBOSE_F(1, "omnitrace_pop_trace :: called %zux\n", _pop_count); diff --git a/source/lib/omnitrace/library/components/fork_gotcha.cpp b/source/lib/omnitrace/library/components/fork_gotcha.cpp index cd4bd31f7..70ddf0466 100644 --- a/source/lib/omnitrace/library/components/fork_gotcha.cpp +++ b/source/lib/omnitrace/library/components/fork_gotcha.cpp @@ -45,6 +45,7 @@ fork_gotcha::configure() void fork_gotcha::audit(const gotcha_data_t&, audit::incoming) { + tim::set_env("OMNITRACE_PRELOAD", "0", 1); OMNITRACE_VERBOSE(1, "fork() called on PID %i (rank: %i), TID %li\n", process::get_id(), dmp::rank(), threading::get_id()); OMNITRACE_BASIC_DEBUG( diff --git 
a/source/lib/omnitrace/library/components/numa_gotcha.cpp b/source/lib/omnitrace/library/components/numa_gotcha.cpp index f97fff147..054ed7e2b 100644 --- a/source/lib/omnitrace/library/components/numa_gotcha.cpp +++ b/source/lib/omnitrace/library/components/numa_gotcha.cpp @@ -61,6 +61,16 @@ get_numa_gotcha() void numa_gotcha::configure() { + // don't emit warnings for missing NUMA functions unless debug or verbosity >= 3 + if(get_verbose_env() < 3 && !get_debug_env()) + { + for(size_t i = 0; i < numa_gotcha_t::capacity(); ++i) + { + auto* itr = numa_gotcha_t::at(i); + if(itr) itr->verbose = -1; + } + } + numa_gotcha_t::get_initializer() = []() { numa_gotcha_t::configure<0, long, void*, unsigned long, int, const unsigned long*, unsigned long, unsigned>("mbind"); diff --git a/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp b/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp index b0c8829ea..2d09ae3d5 100644 --- a/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp +++ b/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp @@ -289,11 +289,18 @@ pthread_create_gotcha::shutdown() bundles->clear(); - OMNITRACE_BASIC_VERBOSE( - 1, - // 2 && _ndangling > 0, - "[pthread_create_gotcha::shutdown] cleaned up %lu dangling bundles\n", - _ndangling); + if(config::settings_are_configured()) + { + OMNITRACE_VERBOSE(2 && _ndangling > 0, + "[pthread_create_gotcha] cleaned up %lu dangling bundles\n", + _ndangling); + } + else + { + OMNITRACE_BASIC_VERBOSE( + 2 && _ndangling > 0, + "[pthread_create_gotcha] cleaned up %lu dangling bundles\n", _ndangling); + } } void diff --git a/source/lib/omnitrace/library/components/rocprofiler.cpp b/source/lib/omnitrace/library/components/rocprofiler.cpp index 96fba6085..cf3d54235 100644 --- a/source/lib/omnitrace/library/components/rocprofiler.cpp +++ b/source/lib/omnitrace/library/components/rocprofiler.cpp @@ -94,8 +94,7 @@ rocm_event::rocm_event(uint32_t _dev, uint32_t _thr, uint32_t 
_queue, feature_values.emplace_back(rocm_feature_value{ p->data.result_int32 }); break; case ROCPROFILER_DATA_KIND_FLOAT: - feature_values.emplace_back( - rocm_feature_value{ static_cast(p->data.result_float) }); + feature_values.emplace_back(rocm_feature_value{ p->data.result_float }); break; case ROCPROFILER_DATA_KIND_DOUBLE: feature_values.emplace_back(rocm_feature_value{ p->data.result_double }); diff --git a/source/lib/omnitrace/library/components/rocprofiler.hpp b/source/lib/omnitrace/library/components/rocprofiler.hpp index b6afaacf9..bb3ed6f1f 100644 --- a/source/lib/omnitrace/library/components/rocprofiler.hpp +++ b/source/lib/omnitrace/library/components/rocprofiler.hpp @@ -53,7 +53,7 @@ namespace component { using rocm_metric_type = unsigned long long; using rocm_info_entry = ::tim::hardware_counters::info; -using rocm_feature_value = std::variant; +using rocm_feature_value = std::variant; struct rocm_counter { diff --git a/source/lib/omnitrace/library/config.cpp b/source/lib/omnitrace/library/config.cpp index cdca176cf..181ce7258 100644 --- a/source/lib/omnitrace/library/config.cpp +++ b/source/lib/omnitrace/library/config.cpp @@ -210,7 +210,7 @@ configure_settings(bool _init) if(_once) return; _once = true; - if(get_state() < State::Init) + if(get_is_continuous_integration() && get_state() < State::Init) { timemory_print_demangled_backtrace<64>(); OMNITRACE_THROW("config::configure_settings() called before " @@ -467,6 +467,10 @@ configure_settings(bool _init) "filter out internal routines from the sampling call-stacks", true, "sampling", "data", "advanced"); + OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_SAMPLING_INCLUDE_INLINES", + "Create entries for inlined functions when available", false, + "sampling", "data", "advanced"); + OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_SAMPLING_REALTIME", "Enable sampling frequency via a wall-clock timer on child threads. 
This may " @@ -812,7 +816,7 @@ configure_settings(bool _init) configure_signal_handler(); configure_disabled_settings(); - OMNITRACE_VERBOSE(1, "configuration complete\n"); + OMNITRACE_VERBOSE(2, "configuration complete\n"); } void @@ -827,8 +831,10 @@ configure_mode_settings() } else { + bool _changed = get_setting_value(_name).second != _v; OMNITRACE_VERBOSE( - 1, "[configure_mode_settings] Overriding %s to %s in %s mode...\n", + 1 && _changed, + "[configure_mode_settings] Overriding %s to %s in %s mode...\n", _name.c_str(), JOIN("", std::boolalpha, _v).c_str(), std::to_string(get_mode()).c_str()); } @@ -1812,6 +1818,13 @@ get_sampling_real_tids() static_cast&>(*_v->second).get(), "thread IDs"); } +bool +get_sampling_include_inlines() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_INCLUDE_INLINES"); + return static_cast&>(*_v->second).get(); +} + int64_t get_critical_trace_count() { @@ -1825,7 +1838,7 @@ get_process_sampling_freq() static auto _v = get_config()->find("OMNITRACE_PROCESS_SAMPLING_FREQ"); auto _val = std::min(static_cast&>(*_v->second).get(), 1000.0); - if(_val < 1.0e-9) return get_sampling_freq(); + if(_val < 1.0e-9) return std::min(get_sampling_freq(), 100.0); return _val; } diff --git a/source/lib/omnitrace/library/config.hpp b/source/lib/omnitrace/library/config.hpp index 4409f874c..2c32c7f0c 100644 --- a/source/lib/omnitrace/library/config.hpp +++ b/source/lib/omnitrace/library/config.hpp @@ -298,6 +298,9 @@ get_sampling_cpu_tids(); std::set get_sampling_real_tids(); +bool +get_sampling_include_inlines(); + double get_process_sampling_freq(); diff --git a/source/lib/omnitrace/library/debug.cpp b/source/lib/omnitrace/library/debug.cpp index 32c426fa9..021997115 100644 --- a/source/lib/omnitrace/library/debug.cpp +++ b/source/lib/omnitrace/library/debug.cpp @@ -78,8 +78,8 @@ FILE* get_file() { static FILE* _v = []() { - auto&& _fname = tim::get_env("OMNITRACE_LOG_FILE", ""); - tim::log::colorized() = _fname.empty(); + auto&& _fname 
= tim::get_env("OMNITRACE_LOG_FILE", ""); + if(!_fname.empty()) tim::log::colorized() = false; return (_fname.empty()) ? stderr : tim::filepath::fopen(_fname, "w"); }(); return _v; diff --git a/source/lib/omnitrace/library/ompt.cpp b/source/lib/omnitrace/library/ompt.cpp index d1e869823..ef6357d32 100644 --- a/source/lib/omnitrace/library/ompt.cpp +++ b/source/lib/omnitrace/library/ompt.cpp @@ -20,6 +20,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "api.hpp" +#include "library/common.hpp" +#include "library/config.hpp" +#include "library/debug.hpp" #include "library/defines.hpp" #include @@ -97,19 +101,37 @@ ompt_start_tool(unsigned int omp_version, const char* runtime_version) OMNITRACE_METADATA("OMP_VERSION", omp_version); OMNITRACE_METADATA("OMP_RUNTIME_VERSION", runtime_version); + if(!omnitrace::settings_are_configured()) + { + OMNITRACE_BASIC_WARNING( + 0, + "[%s] invoked before omnitrace was initialized. In instrumentation mode, " + "settings exported to the environment have not been propagated yet...\n", + __FUNCTION__); + omnitrace::configure_settings(); + } + + static bool _use_ompt = omnitrace::config::get_use_ompt(); static auto ompt_initialize = [](ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t* tool_data) -> int { - TIMEMORY_PRINTF(stderr, "OpenMP-tools configuring for initial device %i\n\n", - initial_device_num); - tim::ompt::configure(lookup, initial_device_num, - tool_data); + _use_ompt = omnitrace::config::get_use_ompt(); + if(_use_ompt) + { + TIMEMORY_PRINTF(stderr, "OpenMP-tools configuring for initial device %i\n\n", + initial_device_num); + tim::ompt::configure(lookup, initial_device_num, + tool_data); + } return 1; // success }; static auto ompt_finalize = [](ompt_data_t* tool_data) { - TIMEMORY_PRINTF(stderr, "OpenMP-tools finalized\n\n"); - tim::consume_parameters(tool_data); + if(_use_ompt) + { + TIMEMORY_PRINTF(stderr, "OpenMP-tools finalized\n\n"); + 
tim::consume_parameters(tool_data); + } }; static auto data = ompt_start_tool_result_t{ ompt_initialize, ompt_finalize, { 0 } }; diff --git a/source/lib/omnitrace/library/rocprofiler.cpp b/source/lib/omnitrace/library/rocprofiler.cpp index f4a31d2fb..213802627 100644 --- a/source/lib/omnitrace/library/rocprofiler.cpp +++ b/source/lib/omnitrace/library/rocprofiler.cpp @@ -572,8 +572,15 @@ post_process_perfetto() { for(size_t i = 0; i < _n; ++i) { +#ifdef __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif auto _plus = [](auto& _lhs, auto&& _rhs) { _lhs += _rhs; }; std::visit(_plus, _values.at(i), vitr->feature_values.at(i)); +#ifdef __GNUC__ +# pragma GCC diagnostic pop +#endif } } } diff --git a/source/lib/omnitrace/library/sampling.cpp b/source/lib/omnitrace/library/sampling.cpp index 9c71e4cd0..a5a92d509 100644 --- a/source/lib/omnitrace/library/sampling.cpp +++ b/source/lib/omnitrace/library/sampling.cpp @@ -103,6 +103,9 @@ namespace sampling { namespace { +std::set +configure(bool _setup, int64_t _tid = threading::get_id()); + template void thread_sigmask(Args... 
_args) @@ -163,6 +166,20 @@ get_sampler_running(int64_t _tid) return _v.at(_tid); } +auto& +get_duration_disabled() +{ + static auto _v = std::atomic{ false }; + return _v; +} + +auto& +get_is_duration_thread() +{ + static thread_local auto _v = false; + return _v; +} + auto& get_duration_cv() { @@ -170,6 +187,13 @@ get_duration_cv() return _v; } +auto& +get_duration_mutex() +{ + static auto _v = std::mutex{}; + return _v; +} + auto& get_duration_thread() { @@ -177,6 +201,28 @@ get_duration_thread() return _v; } +auto +notify_duration_thread() +{ + if(get_duration_thread() && !get_is_duration_thread()) + { + std::unique_lock _lk{ get_duration_mutex(), std::defer_lock }; + if(!_lk.owns_lock()) _lk.lock(); + get_duration_cv().notify_all(); + } +} + +void +stop_duration_thread() +{ + if(get_duration_thread() && !get_is_duration_thread()) + { + notify_duration_thread(); + get_duration_thread()->join(); + get_duration_thread().reset(); + } +} + void start_duration_thread() { @@ -195,12 +241,14 @@ start_duration_thread() config::get_sampling_duration() * units::sec) }; auto _func = [_end]() { thread_info::init(true); - std::mutex _mutex{}; - bool _wait = true; + threading::set_thread_name("omni.samp.dur"); + get_is_duration_thread() = true; + bool _wait = true; while(_wait) { _wait = false; - std::unique_lock _lk{ _mutex }; + std::unique_lock _lk{ get_duration_mutex(), std::defer_lock }; + if(!_lk.owns_lock()) _lk.lock(); get_duration_cv().wait_until(_lk, _end); auto _premature = (std::chrono::steady_clock::now() < _end); auto _finalized = (get_state() == State::Finalized); @@ -218,11 +266,12 @@ start_duration_thread() } else { + get_duration_disabled().store(true); OMNITRACE_VERBOSE(1, "Sampling duration of %f seconds has elapsed. 
" "Shutting down sampling...\n", config::get_sampling_duration()); - shutdown(); + configure(false, 0); } } }; @@ -237,7 +286,7 @@ start_duration_thread() } std::set -configure(bool _setup, int64_t _tid = threading::get_id()) +configure(bool _setup, int64_t _tid) { const auto& _info = thread_info::get(_tid, SequentTID); auto& _sampler = sampling::get_sampler(_tid); @@ -266,6 +315,8 @@ configure(bool _setup, int64_t _tid = threading::get_id()) if(_setup && !_sampler && !_is_running && !_signal_types->empty()) { + if(get_duration_disabled()) return std::set{}; + // if this thread has an offset ID, that means it was created internally // and is probably here bc it called a function which was instrumented. // thus we should not start a sampler for it @@ -356,7 +407,8 @@ configure(bool _setup, int64_t _tid = threading::get_id()) sampling::block_signals(*_signal_types); } - get_duration_cv().notify_one(); + notify_duration_thread(); + if(_tid == 0) { // this propagates to all threads @@ -371,11 +423,7 @@ configure(bool _setup, int64_t _tid = threading::get_id()) } } - if(get_duration_thread()) - { - get_duration_thread()->join(); - get_duration_thread().reset(); - } + stop_duration_thread(); } _sampler->stop(); @@ -416,7 +464,9 @@ setup() std::set shutdown() { - return configure(false); + auto _v = configure(false); + if(utility::get_thread_index() == 0) stop_duration_thread(); + return _v; } void @@ -534,10 +584,10 @@ post_process() if(_data.empty()) { - OMNITRACE_VERBOSE( - 3 || get_debug_sampling(), - "Sampler data for thread %lu has %zu valid entries... (skipped)\n", i, - _raw_data.size()); + OMNITRACE_VERBOSE(2 || get_debug_sampling(), + "Sampler data for thread %lu has zero valid entries out of " + "%zu... 
(skipped)\n", + i, _raw_data.size()); continue; } @@ -619,35 +669,64 @@ post_process_perfetto(int64_t _tid, const bundle_t* _init, static std::set _static_strings{}; for(const auto& iitr : backtrace::filter_and_patch(_bt_cs->get())) { - const auto* _name = _static_strings.emplace(iitr.name).first->c_str(); - uint64_t _beg = _last_ts; - uint64_t _end = _bt_ts->get_timestamp(); + uint64_t _beg = _last_ts; + uint64_t _end = _bt_ts->get_timestamp(); if(!_thread_info->is_valid_lifetime({ _beg, _end })) continue; - tracing::push_perfetto_ts( - category::sampling{}, _name, _beg, [&](perfetto::EventContext ctx) { - tracing::add_perfetto_annotation(ctx, "begin_ns", _beg); - tracing::add_perfetto_annotation(ctx, "file", iitr.location); - tracing::add_perfetto_annotation(ctx, "pc", - _as_hex(iitr.address)); - tracing::add_perfetto_annotation(ctx, "line_address", - _as_hex(iitr.line_address)); - if(iitr.lineinfo) - { - size_t _n = 0; - for(const auto& litr : iitr.lineinfo.lines) + if(get_sampling_include_inlines() && iitr.lineinfo) + { + auto _lines = iitr.lineinfo.lines; + std::reverse(_lines.begin(), _lines.end()); + size_t _n = 0; + for(const auto& litr : _lines) + { + const auto* _name = + _static_strings.emplace(demangle(litr.name)).first->c_str(); + auto _info = JOIN(':', litr.location, litr.line); + tracing::push_perfetto_ts( + category::sampling{}, _name, _beg, + [&](perfetto::EventContext ctx) { + tracing::add_perfetto_annotation(ctx, "begin_ns", _beg); + tracing::add_perfetto_annotation(ctx, "lineinfo", _info); + tracing::add_perfetto_annotation(ctx, "inlined", + (_n++ > 0)); + }); + tracing::pop_perfetto_ts(category::sampling{}, _name, _end, + "end_ns", _end); + } + } + else + { + const auto* _name = _static_strings.emplace(iitr.name).first->c_str(); + tracing::push_perfetto_ts( + category::sampling{}, _name, _beg, + [&](perfetto::EventContext ctx) { + tracing::add_perfetto_annotation(ctx, "begin_ns", _beg); + tracing::add_perfetto_annotation(ctx, "file", 
iitr.location); + tracing::add_perfetto_annotation(ctx, "pc", + _as_hex(iitr.address)); + tracing::add_perfetto_annotation(ctx, "line_address", + _as_hex(iitr.line_address)); + + if(iitr.lineinfo) { - auto _label = JOIN('-', "lineinfo", _n++); - tracing::add_perfetto_annotation( - ctx, _label.c_str(), - JOIN('@', demangle(litr.name), - JOIN(':', litr.location, litr.line))); + auto _lines = iitr.lineinfo.lines; + std::reverse(_lines.begin(), _lines.end()); + size_t _n = 0; + for(const auto& litr : _lines) + { + auto _label = JOIN('-', "lineinfo", _n++); + tracing::add_perfetto_annotation( + ctx, _label.c_str(), + JOIN('@', demangle(litr.name), + JOIN(':', litr.location, litr.line))); + } } - } - }); + }); - tracing::pop_perfetto_ts(category::sampling{}, _name, _end, "end_ns", - _end); + tracing::pop_perfetto_ts(category::sampling{}, _name, _end, "end_ns", + _end); + } } _last_ts = _bt_ts->get_timestamp(); } diff --git a/source/lib/omnitrace/library/thread_info.cpp b/source/lib/omnitrace/library/thread_info.cpp index 7d6bd14f5..675d846e9 100644 --- a/source/lib/omnitrace/library/thread_info.cpp +++ b/source/lib/omnitrace/library/thread_info.cpp @@ -53,17 +53,20 @@ init_index_data(int64_t _tid, bool _offset = false) if(!itr) { threading::offset_this_id(_offset); - itr = thread_index_data{}; + itr = thread_index_data{}; + int _verb = 2; + // if thread created using finalization, bump up the minimum verbosity level + if(get_state() == State::Finalized && _offset) _verb += 2; if(!config::settings_are_configured()) { OMNITRACE_BASIC_VERBOSE_F( - 2, "Thread %li on PID %i (rank: %i) assigned omnitrace TID %li\n", + _verb, "Thread %li on PID %i (rank: %i) assigned omnitrace TID %li\n", itr->system_value, process::get_id(), dmp::rank(), itr->sequent_value); } else { OMNITRACE_VERBOSE_F( - 2, "Thread %li on PID %i (rank: %i) assigned omnitrace TID %li\n", + _verb, "Thread %li on PID %i (rank: %i) assigned omnitrace TID %li\n", itr->system_value, process::get_id(), dmp::rank(), 
itr->sequent_value); } } @@ -149,10 +152,12 @@ thread_info::set_stop(uint64_t _ts) { for(auto& itr : thread_info_data_t::instances()) { - if(itr && itr->index_data && itr->index_data->internal_value > _tid) + if(itr && itr->index_data && itr->index_data->internal_value != _tid) { if(itr->lifetime.second > _v->lifetime.second) itr->lifetime.second = _v->lifetime.second; + else if(itr->lifetime.second == 0) + itr->lifetime.second = _v->lifetime.second; } } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e87c5e906..3a10ffdee 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -47,6 +47,7 @@ set(_flat_environment "OMNITRACE_TIMELINE_PROFILE=OFF" "OMNITRACE_COLLAPSE_PROCESSES=ON" "OMNITRACE_COLLAPSE_THREADS=ON" + "OMNITRACE_SAMPLING_FREQ=50" "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count" "${_test_openmp_env}" "${_test_library_path}") @@ -54,7 +55,7 @@ set(_flat_environment set(_lock_environment "OMNITRACE_USE_SAMPLING=ON" "OMNITRACE_USE_PROCESS_SAMPLING=OFF" - "OMNITRACE_SAMPLING_FREQ=250" + "OMNITRACE_SAMPLING_FREQ=750" "OMNITRACE_CRITICAL_TRACE=ON" "OMNITRACE_COLLAPSE_THREADS=ON" "OMNITRACE_TRACE_THREAD_LOCKS=ON" @@ -233,7 +234,7 @@ endfunction() # -------------------------------------------------------------------------------------- # function(OMNITRACE_ADD_TEST) - foreach(_PREFIX RUNTIME REWRITE REWRITE_RUN BASELINE) + foreach(_PREFIX PRELOAD RUNTIME REWRITE REWRITE_RUN BASELINE) foreach(_TYPE PASS FAIL SKIP) list(APPEND _REGEX_OPTS "${_PREFIX}_${_TYPE}_REGEX") endforeach() @@ -243,10 +244,9 @@ function(OMNITRACE_ADD_TEST) cmake_parse_arguments( TEST - "SKIP_BASELINE;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING;FORCE_SAMPLING" # options - "NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value - # args - "${_KWARGS}" # multiple value args + "SKIP_BASELINE;SKIP_PRELOAD;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING;FORCE_SAMPLING" + "NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT;PRELOAD" + "${_KWARGS}" 
${ARGN}) if(TEST_GPU AND NOT _VALID_GPU) @@ -278,6 +278,10 @@ function(OMNITRACE_ADD_TEST) set(TEST_RUNTIME_TIMEOUT 300) endif() + if(NOT TEST_PRELOAD_TIMEOUT) + set(TEST_PRELOAD_TIMEOUT 120) + endif() + if(NOT DEFINED TEST_ENVIRONMENT OR "${TEST_ENVIRONMENT}" STREQUAL "") set(TEST_ENVIRONMENT "${_test_environment}") endif() @@ -308,6 +312,14 @@ function(OMNITRACE_ADD_TEST) WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() + if(NOT TEST_SKIP_PRELOAD) + add_test( + NAME ${TEST_NAME}-preload + COMMAND ${COMMAND_PREFIX} $ -- + $ ${TEST_RUN_ARGS} + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + endif() + if(NOT TEST_SKIP_REWRITE) add_test( NAME ${TEST_NAME}-binary-rewrite @@ -370,27 +382,29 @@ function(OMNITRACE_ADD_TEST) foreach( _TEST - baseline binary-rewrite binary-rewrite-run binary-rewrite-sampling + baseline preload binary-rewrite binary-rewrite-run binary-rewrite-sampling binary-rewrite-sampling-run runtime-instrument runtime-instrument-sampling) string(REGEX REPLACE "-run(-|/)" "\\1" _prefix "${TEST_NAME}-${_TEST}/") - set(_environ "${TEST_ENVIRONMENT}") - set(_labels "${_TEST}") - set(_timeout ${TEST_REWRITE_TIMEOUT}) - list(APPEND _environ "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output" - "OMNITRACE_OUTPUT_PREFIX=${_prefix}") + set(_labels "${TEST_LABELS}" "${_TEST}") string(REPLACE "-run" "" _labels "${_TEST}") string(REPLACE "-sampling" ";sampling" _labels "${_labels}") if(TEST_TARGET) list(APPEND _labels "${TEST_TARGET}") endif() - if("${_TEST}" MATCHES "runtime-instrument") + + set(_environ + "${TEST_ENVIRONMENT}" "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output" + "OMNITRACE_OUTPUT_PREFIX=${_prefix}") + + set(_timeout ${TEST_REWRITE_TIMEOUT}) + if("${_TEST}" MATCHES "preload") + set(_timeout ${TEST_PRELOAD_TIMEOUT}) + elseif("${_TEST}" MATCHES "runtime-instrument") set(_timeout ${TEST_RUNTIME_TIMEOUT}) endif() - if("${_TEST}" MATCHES "binary-rewrite-run") - list(APPEND _labels "binary-rewrite-run") - endif() + set(_props) - if("${_TEST}" MATCHES "run|baseline") 
+ if("${_TEST}" MATCHES "run|preload|baseline") set(_props ${TEST_PROPERTIES}) if(NOT "RUN_SERIAL" IN_LIST _props) list(APPEND _props RUN_SERIAL ON) @@ -405,6 +419,8 @@ function(OMNITRACE_ADD_TEST) set(_REGEX_VAR REWRITE) elseif("${_TEST}" MATCHES "baseline") set(_REGEX_VAR BASELINE) + elseif("${_TEST}" MATCHES "preload") + set(_REGEX_VAR PRELOAD) else() set(_REGEX_VAR) endif() @@ -426,7 +442,7 @@ function(OMNITRACE_ADD_TEST) TIMEOUT ${_timeout} LABELS - "${_labels};${TEST_LABELS}" + "${_labels}" PASS_REGULAR_EXPRESSION "${${_PASS_REGEX}}" FAIL_REGULAR_EXPRESSION @@ -647,7 +663,7 @@ omnitrace_add_test( NUM_PROCS ${NUM_PROCS} REWRITE_ARGS -e -v 2 -E uniform_int_distribution ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}" + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}" REWRITE_RUN_PASS_REGEX "rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt" ) @@ -662,7 +678,7 @@ omnitrace_add_test( NUM_PROCS ${NUM_PROCS} REWRITE_ARGS -e -v 2 -E uniform_int_distribution ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_USE_ROCTRACER=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}" + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_ROCTRACER=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}" REWRITE_RUN_PASS_REGEX "rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt" REWRITE_RUN_FAIL_REGEX "roctracer.txt") 
@@ -782,7 +798,6 @@ if(OMNITRACE_USE_MPI OR OMNITRACE_USE_MPI_HEADERS) --label file line - return args --min-instructions 0 @@ -806,7 +821,6 @@ if(OMNITRACE_USE_MPI OR OMNITRACE_USE_MPI_HEADERS) --label file line - return args --min-instructions 0 @@ -836,9 +850,9 @@ omnitrace_add_test( -ME [==[lib(gomp|m-)]==] LABELS "kokkos;kokkos-profile-library" - RUN_ARGS -i 10 -s 20 -p + RUN_ARGS -i 25 -s 20 -p ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" REWRITE_RUN_PASS_REGEX "\\|_\\[kokkos\\]" RUNTIME_PASS_REGEX "\\|_\\[kokkos\\]") @@ -852,9 +866,8 @@ omnitrace_add_test( LABELS "kokkos;kokkos-profile-library" RUN_ARGS -i 10 -s 20 -p ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;KOKKOS_PROFILE_LIBRARY=libomnitrace.so" - BASELINE_PASS_REGEX - "\\|_\\[kokkos\\]") + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;KOKKOS_PROFILE_LIBRARY=libomnitrace.so" + BASELINE_PASS_REGEX "\\|_\\[kokkos\\]") omnitrace_add_test( SKIP_RUNTIME SKIP_REWRITE @@ -866,9 +879,8 @@ omnitrace_add_test( LABELS "kokkos;kokkos-profile-library" RUN_ARGS -i 10 -s 20 -p ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" - BASELINE_PASS_REGEX - "\\|_\\[kokkos\\]") + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" + BASELINE_PASS_REGEX "\\|_\\[kokkos\\]") omnitrace_add_test( SKIP_BASELINE @@ -972,10 +984,46 @@ omnitrace_add_test( REWRITE_TIMEOUT 180 
RUNTIME_TIMEOUT 360 ENVIRONMENT - "${_ompt_environment};OMNITRACE_USE_SAMPLING=ON;OMNITRACE_SAMPLING_FREQ=100;OMNITRACE_COUT_OUTPUT=ON" + "${_ompt_environment};OMNITRACE_USE_SAMPLING=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_COUT_OUTPUT=ON" REWRITE_RUN_PASS_REGEX "${_OMPT_PASS_REGEX}" REWRITE_FAIL_REGEX "0 instrumented loops in procedure") +set(_ompt_preload_environ + "${_ompt_environment}" + "OMNITRACE_USE_OMPT=OFF" + "OMNITRACE_USE_SAMPLING=ON" + "OMNITRACE_USE_PROCESS_SAMPLING=OFF" + "OMNITRACE_SAMPLING_FREQ=100" + "OMNITRACE_SAMPLING_DELAY=0.1" + "OMNITRACE_SAMPLING_DURATION=0.25" + "OMNITRACE_SAMPLING_CPUTIME=ON" + "OMNITRACE_SAMPLING_REALTIME=ON" + "OMNITRACE_SAMPLING_CPUTIME_FREQ=1000" + "OMNITRACE_SAMPLING_REALTIME_FREQ=500") + +set(_ompt_preload_samp_regex + "Sampler for thread 0 will be triggered 1000.0x per second of CPU-time(.*)Sampler for thread 0 will be triggered 500.0x per second of wall-time(.*)Sampling will be disabled after 0.250000 seconds(.*)Sampling duration of 0.250000 seconds has elapsed. Shutting down sampling" + ) +set(_ompt_preload_file_regex + "sampling-duration-preload/sampling_percent.(json|txt)(.*)sampling-duration-preload/sampling_cpu_clock.(json|txt)(.*)sampling-duration-preload/sampling_wall_clock.(json|txt)" + ) + +omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE SKIP_SAMPLING + NAME openmp-cg-sampling-duration + TARGET openmp-cg + LABELS "openmp;sampling-duration" + ENVIRONMENT "${_ompt_preload_environ}" + PRELOAD_PASS_REGEX "${_ompt_preload_samp_regex}(.*)${_ompt_preload_file_regex}") + +omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE SKIP_SAMPLING + NAME openmp-lu-sampling-duration + TARGET openmp-lu + LABELS "openmp;sampling-duration" + ENVIRONMENT "${_ompt_preload_environ}" + PRELOAD_PASS_REGEX "${_ompt_preload_samp_regex}(.*)${_ompt_preload_file_regex}") + omnitrace_add_test( SKIP_BASELINE SKIP_SAMPLING NAME code-coverage