diff --git a/.cmake-format.yaml b/.cmake-format.yaml index 6ee48f754..8e661d6b8 100644 --- a/.cmake-format.yaml +++ b/.cmake-format.yaml @@ -28,6 +28,7 @@ parse: NAME: '*' TARGET: '*' MPI: '*' + GPU: '*' NUM_PROCS: '*' REWRITE_TIMEOUT: '*' RUNTIME_TIMEOUT: '*' diff --git a/.github/workflows/opensuse.yml b/.github/workflows/opensuse.yml index 71290a708..1bc056db3 100644 --- a/.github/workflows/opensuse.yml +++ b/.github/workflows/opensuse.yml @@ -121,5 +121,6 @@ jobs: with: name: data-${{ github.job }}-files path: | + build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json diff --git a/.github/workflows/ubuntu-bionic.yml b/.github/workflows/ubuntu-bionic.yml index 823f44e69..8b2b8806d 100644 --- a/.github/workflows/ubuntu-bionic.yml +++ b/.github/workflows/ubuntu-bionic.yml @@ -150,5 +150,6 @@ jobs: with: name: data-${{ github.job }}-files path: | + build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index 56327d0ba..c70b01302 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -66,7 +66,7 @@ jobs: add-apt-repository -y ppa:ubuntu-toolchain-r/test && apt-get update && apt-get upgrade -y && - apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev ${{ matrix.compiler }} && + apt-get install -y build-essential m4 autoconf libtool python3-pip libiberty-dev clang libomp-dev libmpich-dev mpich ${{ matrix.compiler }} && python3 -m pip install --upgrade pip && python3 -m pip install numpy && python3 -m pip install perfetto && @@ -166,6 +166,7 @@ jobs: with: name: data-${{ github.job }}-files path: | + build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json @@ -176,12 +177,23 @@ jobs: strategy: matrix: compiler: ['g++'] - rocm_version: ['4.3', '4.5', 'debian'] + rocm_version: ['4.3', '4.5', '5.0'] + mpi_headers: ['OFF'] + build_jobs: ['4'] + ctest_exclude: ['-LE "mpi-example|transpose"'] + perfetto-tools: ['ON'] + include: + - compiler: 'g++' + rocm_version: 'debian' + mpi_headers: 'ON' + build_jobs: '2' + ctest_exclude: '-LE transpose' + perfetto-tools: 'OFF' env: BUILD_TYPE: MinSizeRel - OMNITRACE_OUTPUT_PATH: omnitrace-tests-output - OMNITRACE_OUTPUT_PREFIX: "%argt%/" + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 steps: - uses: actions/checkout@v2 @@ -194,18 +206,41 @@ jobs: wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${{ matrix.rocm_version }}/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list && apt-get update && - apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev libopenmpi-dev curl && + apt-get install -y build-essential m4 autoconf libtool python3-pip clang libomp-dev ${{ matrix.compiler }} libudev-dev libnuma-dev rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev libpapi-dev curl libopenmpi-dev openmpi-bin libfabric-dev && python3 -m pip install --upgrade pip && - python3 -m pip install 'cmake==3.16.3' && - for i in 6 7 8 9; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done + python3 -m pip install 'cmake==3.21.4' && + for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done - - name: Configure Env + - name: Install RCCL + if: ${{ matrix.rocm_version != '4.3' }} + timeout-minutes: 5 run: - echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV && - echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV && - echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV && - echo "/opt/omnitrace/bin:/opt/dyninst/bin:/opt/elfutils/bin:${HOME}/.local/bin" >> $GITHUB_PATH && - echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + apt-get install -y rccl-dev + + - name: Configure Env + run: | + echo "CC=$(echo '${{ matrix.compiler }}' | sed 's/+/c/g')" >> $GITHUB_ENV + echo "CXX=${{ matrix.compiler }}" >> $GITHUB_ENV + echo "CMAKE_PREFIX_PATH=/opt/dyninst:/opt/elfutils:${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=/opt/dyninst/lib:/opt/elfutils/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + cat << EOF > test-install.cfg + OMNITRACE_USE_TIMEMORY = ON + OMNITRACE_USE_PERFETTO = ON + OMNITRACE_USE_PID = OFF + OMNITRACE_USE_SAMPLING = OFF + OMNITRACE_USE_PROCESS_SAMPLING = OFF + OMNITRACE_COUT_OUTPUT = ON + OMNITRACE_TIME_OUTPUT = OFF + OMNITRACE_TIMEMORY_COMPONENTS = cpu_clock cpu_util current_peak_rss kernel_mode_time monotonic_clock monotonic_raw_clock network_stats num_io_in num_io_out num_major_page_faults num_minor_page_faults page_rss peak_rss priority_context_switch process_cpu_clock process_cpu_util read_bytes read_char system_clock thread_cpu_clock thread_cpu_util timestamp trip_count user_clock user_mode_time virtual_memory voluntary_context_switch wall_clock written_bytes written_char + OMNITRACE_OUTPUT_PATH = omnitrace-tests-output + OMNITRACE_OUTPUT_PREFIX = %tag%/ + OMNITRACE_DEBUG = OFF + OMNITRACE_VERBOSE = 3 + OMNITRACE_DL_VERBOSE = 3 + OMNITRACE_PERFETTO_BACKEND = system + EOF + realpath test-install.cfg + cat test-install.cfg - name: Configure CMake timeout-minutes: 10 @@ -217,22 +252,27 @@ jobs: -DCMAKE_CXX_COMPILER=${{ matrix.compiler }} -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} -DCMAKE_INSTALL_PREFIX=/opt/omnitrace - -DOMNITRACE_BUILD_TESTING=OFF + -DOMNITRACE_BUILD_TESTING=ON -DOMNITRACE_BUILD_DEVELOPER=ON -DOMNITRACE_BUILD_EXTRA_OPTIMIZATIONS=OFF -DOMNITRACE_BUILD_LTO=OFF -DOMNITRACE_USE_MPI=OFF - -DOMNITRACE_USE_MPI_HEADERS=ON -DOMNITRACE_USE_HIP=ON -DOMNITRACE_MAX_THREADS=32 - -DOMNITRACE_USE_SANITIZER=OFF -DOMNITRACE_USE_PAPI=OFF - -DOMNITRACE_INSTALL_PERFETTO_TOOLS=ON + -DOMNITRACE_USE_OMPT=OFF + -DOMNITRACE_USE_PYTHON=ON + -DOMNITRACE_USE_MPI_HEADERS=${{ matrix.mpi_headers }} + -DOMNITRACE_USE_SANITIZER=OFF + -DOMNITRACE_INSTALL_PERFETTO_TOOLS=${{ matrix.perfetto-tools }} + -DOMNITRACE_PYTHON_PREFIX=/opt/conda/envs + -DOMNITRACE_PYTHON_ENVS="py3.6;py3.7;py3.8;py3.9;py3.10" + -DOMNITRACE_CI_MPI_RUN_AS_ROOT=${{ matrix.mpi_headers }} - name: Build timeout-minutes: 60 run: - cmake --build build --target all --parallel 2 -- VERBOSE=1 + cmake --build build --target all --parallel ${{ matrix.build_jobs }} -- VERBOSE=1 - name: Install run: @@ -244,20 +284,28 @@ jobs: cd build && ldd ./bin/omnitrace && ./bin/omnitrace --help && - ctest -V -N -O omnitrace-ctest-${{ github.job }}-commands.log && - ctest -V --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure + ctest -V ${{ matrix.ctest_exclude }} -N -O omnitrace-ctest-${{ github.job }}-commands.log && + ctest -V ${{ matrix.ctest_exclude }} --output-log omnitrace-ctest-${{ github.job }}.log --stop-on-failure + + - name: Configure Install Env + run: | + echo "/opt/omnitrace/bin" >> $GITHUB_PATH + echo "LD_LIBRARY_PATH=/opt/omnitrace/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV + echo "OMNITRACE_CONFIG_FILE=test-install.cfg" >> $GITHUB_ENV - name: Test Install timeout-minutes: 10 + if: ${{ matrix.perfetto-tools == 'ON' }} run: | set -v + cat ${OMNITRACE_CONFIG_FILE} omnitrace-perfetto-traced --background - export OMNITRACE_DEBUG=ON - export OMNITRACE_PERFETTO_BACKEND=system which omnitrace-avail ldd $(which omnitrace-avail) omnitrace-avail --help omnitrace-avail -a + which omnitrace-python + omnitrace-python --help which omnitrace-critical-trace ldd $(which omnitrace-critical-trace) which omnitrace @@ -272,6 +320,31 @@ jobs: du -m ls-perfetto-trace.proto /opt/conda/envs/py3.8/bin/python ./tests/validate-perfetto-proto.py -p -i ./ls-perfetto-trace.proto + - name: Test Install + timeout-minutes: 10 + if: ${{ matrix.perfetto-tools == 'OFF' }} + run: | + set -v + cat ${OMNITRACE_CONFIG_FILE} + which omnitrace-avail + ldd $(which omnitrace-avail) + omnitrace-avail --help + omnitrace-avail -a + which omnitrace-python + omnitrace-python --help + which omnitrace-critical-trace + ldd $(which omnitrace-critical-trace) + which omnitrace + ldd $(which omnitrace) + omnitrace --help + omnitrace -e -v 1 -o sleep.inst --simulate -- sleep + omnitrace -e -v 1 --simulate -- sleep + omnitrace -e -v 1 -o sleep.inst -- sleep + ./sleep.inst 5 + omnitrace -e -v 1 -- sleep 5 + cat omnitrace-tests-output/sleep.inst/wall_clock.txt + cat omnitrace-tests-output/sleep/wall_clock.txt + - name: Test User API timeout-minutes: 10 run: | @@ -293,6 +366,7 @@ jobs: name: data-${{ github.job }}-files path: | omnitrace-tests-output/**/*.txt + build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json @@ -445,5 +519,6 @@ jobs: with: name: data-${{ github.job }}-files path: | + ${{ github.workspace }}/build/omnitrace-tests-config/*.cfg ${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt ${{ github.workspace }}/build/omnitrace-tests-output/**/*-instr*.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ae15ed8b..c55fba1b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,7 @@ omnitrace_add_option(OMNITRACE_USE_ROCPROFILER "Enable rocprofiler support" omnitrace_add_option( OMNITRACE_USE_ROCM_SMI "Enable rocm-smi support for power/temp/etc. sampling" ${OMNITRACE_USE_HIP}) +omnitrace_add_option(OMNITRACE_USE_RCCL "Enable RCCL support" ${OMNITRACE_USE_HIP}) omnitrace_add_option(OMNITRACE_USE_MPI_HEADERS "Enable wrapping MPI functions w/o enabling MPI dependency" ON) omnitrace_add_option(OMNITRACE_USE_OMPT "Enable OpenMP tools support" ON) @@ -175,14 +176,18 @@ if(NOT OMNITRACE_USE_HIP) set(OMNITRACE_USE_ROCM_SMI OFF CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE) + set(OMNITRACE_USE_RCCL + OFF + CACHE BOOL "Disabled via OMNITRACE_USE_HIP=OFF" FORCE) elseif( OMNITRACE_USE_HIP AND NOT OMNITRACE_USE_ROCTRACER AND NOT OMNITRACE_USE_ROCPROFILER - AND NOT OMNITRACE_USE_ROCM_SMI) + AND NOT OMNITRACE_USE_ROCM_SMI + AND NOT OMNITRACE_USE_RCCL) omnitrace_message( AUTHOR_WARNING - "Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, and rocm-smi options are disabled" + "Setting OMNITRACE_USE_HIP=OFF because roctracer, rocprofiler, rccl, and rocm-smi options are disabled" ) set(OMNITRACE_USE_HIP OFF) endif() diff --git a/cmake/ConfigCPack.cmake b/cmake/ConfigCPack.cmake index 15f16e742..4aa3bdd2c 100644 --- a/cmake/ConfigCPack.cmake +++ b/cmake/ConfigCPack.cmake @@ -157,6 +157,7 @@ if(NOT OMNITRACE_BUILD_DYNINST) endif() endif() if(ROCmVersion_FOUND) + set(_ROCPROFILER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})") set(_ROCTRACER_SUFFIX " (>= 1.0.0.${ROCmVersion_NUMERIC_VERSION})") set(_ROCM_SMI_SUFFIX " (>= ${ROCmVersion_MAJOR_VERSION}.0.0.${ROCmVersion_NUMERIC_VERSION})") @@ -167,6 +168,9 @@ endif() if(OMNITRACE_USE_ROCTRACER) list(APPEND _DEBIAN_PACKAGE_DEPENDS "roctracer-dev${_ROCTRACER_SUFFIX}") endif() +if(OMNITRACE_USE_ROCPROFILER) + list(APPEND _DEBIAN_PACKAGE_DEPENDS "rocprofiler-dev${_ROCPROFILER_SUFFIX}") +endif() if(OMNITRACE_USE_MPI) if("${OMNITRACE_MPI_IMPL}" STREQUAL "openmpi") list(APPEND _DEBIAN_PACKAGE_DEPENDS "libopenmpi-dev") diff --git a/cmake/Modules/FindRCCL-Headers.cmake b/cmake/Modules/FindRCCL-Headers.cmake new file mode 100644 index 000000000..f70876833 --- /dev/null +++ b/cmake/Modules/FindRCCL-Headers.cmake @@ -0,0 +1,94 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying file +# Copyright.txt or https://cmake.org/licensing for details. + +include(FindPackageHandleStandardArgs) + +# ----------------------------------------------------------------------------------------# + +set(RCCL-Headers_INCLUDE_DIR_INTERNAL + "${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls/rccl" + CACHE PATH "Path to internal rccl.h") + +# ----------------------------------------------------------------------------------------# + +if(NOT ROCM_PATH AND NOT "$ENV{ROCM_PATH}" STREQUAL "") + set(ROCM_PATH "$ENV{ROCM_PATH}") +endif() + +foreach(_DIR ${ROCmVersion_DIR} ${ROCM_PATH} /opt/rocm /opt/rocm/rccl) + if(EXISTS ${_DIR}) + get_filename_component(_ABS_DIR "${_DIR}" REALPATH) + list(APPEND _RCCL_PATHS ${_ABS_DIR}) + endif() +endforeach() + +# ----------------------------------------------------------------------------------------# + +find_package( + rccl + QUIET + CONFIG + HINTS + ${_RCCL_PATHS} + PATHS + ${_RCCL_PATHS} + PATH_SUFFIXES + rccl/lib/cmake) + +if(NOT rccl_FOUND) + set(RCCL-Headers_INCLUDE_DIR + "${RCCL-Headers_INCLUDE_DIR_INTERNAL}" + CACHE PATH "Path to RCCL headers") +else() + set(RCCL-Headers_INCLUDE_DIR + "${rccl_INCLUDE_DIR}" + CACHE PATH "Path to RCCL headers") +endif() + +# because of the annoying warning starting with v5.2.0, we've got to do this crap +if(ROCmVersion_NUMERIC_VERSION) + if(ROCmVersion_NUMERIC_VERSION LESS 50200) + set(_RCCL-Headers_FILE "rccl.h") + set(_RCCL-Headers_DIR "/rccl") + else() + set(_RCCL-Headers_FILE "rccl/rccl.h") + set(_RCCL-Headers_DIR "") + endif() +else() + set(_RCCL-Headers_FILE "rccl/rccl.h") + set(_RCCL-Headers_DIR "") +endif() + +if(NOT EXISTS "${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}") + omnitrace_message( + AUTHOR_WARNING + "RCCL header (${RCCL-Headers_INCLUDE_DIR}/${_RCCL-Headers_FILE}) does not exist! Setting RCCL-Headers_INCLUDE_DIR to internal RCCL include directory: ${RCCL-Headers_INCLUDE_DIR_INTERNAL}" + ) + set(RCCL-Headers_INCLUDE_DIR + "${RCCL-Headers_INCLUDE_DIR_INTERNAL}${_RCCL-Headers_DIR}" + CACHE PATH "Path to RCCL headers" FORCE) +endif() + +unset(_RCCL-Headers_FILE) +unset(_RCCL-Headers_DIR) + +mark_as_advanced(RCCL-Headers_INCLUDE_DIR) + +# ----------------------------------------------------------------------------------------# + +find_package_handle_standard_args(RCCL-Headers DEFAULT_MSG RCCL-Headers_INCLUDE_DIR) + +# ------------------------------------------------------------------------------# + +if(RCCL-Headers_FOUND) + add_library(roc::rccl-headers INTERFACE IMPORTED) + set(RCCL-Headers_INCLUDE_DIRS ${RCCL-Headers_INCLUDE_DIR}) + + target_include_directories(roc::rccl-headers SYSTEM + INTERFACE ${RCCL-Headers_INCLUDE_DIR}) + + add_library(RCCL-Headers::RCCL-Headers INTERFACE IMPORTED) + target_link_libraries(RCCL-Headers::RCCL-Headers INTERFACE roc::rccl-headers) +endif() + +# ------------------------------------------------------------------------------# diff --git a/cmake/Packages.cmake b/cmake/Packages.cmake index 72856d79b..54dbb381a 100644 --- a/cmake/Packages.cmake +++ b/cmake/Packages.cmake @@ -20,6 +20,8 @@ omnitrace_add_interface_library(omnitrace-rocprofiler "Provides flags and libraries for rocprofiler") omnitrace_add_interface_library(omnitrace-rocm-smi "Provides flags and libraries for rocm-smi") +omnitrace_add_interface_library( + omnitrace-rccl "Provides flags for ROCm Communication Collectives Library (RCCL)") omnitrace_add_interface_library(omnitrace-mpi "Provides MPI or MPI headers") omnitrace_add_interface_library(omnitrace-ptl "Enables PTL support (tasking)") omnitrace_add_interface_library(omnitrace-papi "Enable PAPI support") @@ -37,6 +39,7 @@ set(OMNITRACE_EXTENSION_LIBRARIES omnitrace::omnitrace-roctracer omnitrace::omnitrace-rocprofiler omnitrace::omnitrace-rocm-smi + omnitrace::omnitrace-rccl omnitrace::omnitrace-mpi omnitrace::omnitrace-ptl omnitrace::omnitrace-ompt @@ -196,6 +199,17 @@ if(OMNITRACE_USE_ROCM_SMI) set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${rocm-smi_LIBRARY_DIRS}") endif() +# ----------------------------------------------------------------------------------------# +# +# RCCL +# +# ----------------------------------------------------------------------------------------# +if(OMNITRACE_USE_RCCL) + find_package(RCCL-Headers ${omnitrace_FIND_QUIETLY} REQUIRED) + target_link_libraries(omnitrace-rccl INTERFACE roc::rccl-headers) + omnitrace_target_compile_definitions(omnitrace-rccl INTERFACE OMNITRACE_USE_RCCL) +endif() + # ----------------------------------------------------------------------------------------# # # MPI diff --git a/docker/Dockerfile.centos b/docker/Dockerfile.centos index 4889f4be2..bf2611339 100644 --- a/docker/Dockerfile.centos +++ b/docker/Dockerfile.centos @@ -24,7 +24,7 @@ ARG AMDGPU_RPM=21.40.2/rhel/7.9/amdgpu-install-21.40.2.40502-1.el7.noarch.rpm RUN yum install -y https://repo.radeon.com/amdgpu-install/${AMDGPU_RPM} && \ amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms --skip-broken -y && \ - yum install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \ + yum install -y rocm-hip-sdk rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev && \ yum update -y && \ yum clean all diff --git a/docker/Dockerfile.opensuse b/docker/Dockerfile.opensuse index 81c138607..d6a6e8278 100644 --- a/docker/Dockerfile.opensuse +++ b/docker/Dockerfile.opensuse @@ -25,7 +25,7 @@ RUN zypper --no-gpg-checks install -y https://repo.radeon.com/amdgpu-install/${A zypper addrepo https://download.opensuse.org/repositories/devel:languages:perl/SLE_15/devel:languages:perl.repo && \ zypper --non-interactive --gpg-auto-import-keys refresh && \ amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms -y && \ - zypper install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \ + zypper install -y rocm-hip-sdk rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev && \ zypper clean --all ARG PYTHON_VERSIONS="6 7 8 9 10" diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index d822cddb6..2fedac199 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -28,7 +28,7 @@ RUN apt-get update && \ echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${ROCM_REPO_VERSION}/ ${ROCM_REPO_DIST} main" | tee /etc/apt/sources.list.d/rocm.list && \ apt-get update && \ apt-get dist-upgrade -y && \ - apt-get install -y rocm-dev rocm-utils roctracer-dev rocprofiler-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${EXTRA_PACKAGES} && \ + apt-get install -y rocm-dev rocm-utils rocm-smi-lib roctracer-dev rocprofiler-dev rccl-dev hip-base hsa-amd-aqlprofile hsa-rocr-dev hsakmt-roct-dev ${EXTRA_PACKAGES} && \ apt-get autoclean RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a3a5d7506..3cfdd7958 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -32,3 +32,4 @@ add_subdirectory(openmp) add_subdirectory(mpi) add_subdirectory(python) add_subdirectory(lulesh) +add_subdirectory(rccl) diff --git a/examples/rccl/CMakeLists.txt b/examples/rccl/CMakeLists.txt new file mode 100644 index 000000000..275b16012 --- /dev/null +++ b/examples/rccl/CMakeLists.txt @@ -0,0 +1,61 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +project(omnitrace-rccl-example LANGUAGES CXX) + +find_package(rccl) +find_package(hip HINTS ${ROCmVersion_DIR} PATHS ${ROCmVersion_DIR}) + +function(rccl_message) + if("${CMAKE_PROJECT_NAME}" STREQUAL "omnitrace") + omnitrace_message(${ARGN}) + else() + message(${ARGN}) + endif() +endfunction() + +if(hip_FOUND AND rccl_FOUND) + include(FetchContent) + fetchcontent_declare( + rccl-tests GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rccl-tests.git) + + # After the following call, the CMake targets defined by googletest and Catch2 will be + # available to the rest of the build + fetchcontent_makeavailable(rccl-tests) + + get_filename_component(rccl_ROOT_DIR "${rccl_INCLUDE_DIR}" DIRECTORY) + + rccl_message(STATUS "Building rccl-tests...") + execute_process( + COMMAND make HIP_HOME=${ROCM_PATH} RCCL_HOME=${rccl_ROOT_DIR} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/_deps/rccl-tests-src + RESULT_VARIABLE _RCCL_BUILD_RET + ERROR_VARIABLE _RCCL_BUILD_ERR + OUTPUT_VARIABLE _RCCL_BUILD_OUT + OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE) + + if(NOT _RCCL_BUILD_RET EQUAL 0) + rccl_message(${_RCCL_BUILD_OUT}) + rccl_message(AUTHOR_WARNING "Failed to build rccl-tests: ${_RCCL_BUILD_ERR}") + else() + file(GLOB RCCL_TEST_EXECUTABLES + ${CMAKE_BINARY_DIR}/_deps/rccl-tests-src/build/*_perf) + set(_RCCL_TEST_TARGETS) + + foreach(_EXE ${RCCL_TEST_EXECUTABLES}) + get_filename_component(_EXE_NAME "${_EXE}" NAME) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${_EXE} + ${CMAKE_CURRENT_BINARY_DIR}/${_EXE_NAME}) + add_executable(rccl-tests::${_EXE_NAME} IMPORTED GLOBAL) + set_property( + TARGET rccl-tests::${_EXE_NAME} + PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${_EXE_NAME}) + list(APPEND _RCCL_TEST_TARGETS "rccl-tests::${_EXE_NAME}") + endforeach() + + set(RCCL_TEST_TARGETS + "${_RCCL_TEST_TARGETS}" + CACHE INTERNAL "rccl-test targets") + endif() +else() + rccl_message(AUTHOR_WARNING "${PROJECT_NAME} skipped. Missing RCCL and/or HIP...") +endif() diff --git a/examples/transpose/CMakeLists.txt b/examples/transpose/CMakeLists.txt index 72962cecb..828e0be93 100644 --- a/examples/transpose/CMakeLists.txt +++ b/examples/transpose/CMakeLists.txt @@ -40,13 +40,16 @@ endif() add_executable(transpose transpose.cpp) -if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") +if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" + AND NOT CMAKE_CXX_COMPILER_IS_HIPCC + AND NOT HIPCC_EXECUTABLE) target_link_libraries( transpose - PRIVATE - $,omnitrace::omnitrace-compile-options,> - $,hip::host,> - $,hip::device,>) + PRIVATE $ + $ $) +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_link_libraries( + transpose PRIVATE $) else() target_compile_options(transpose PRIVATE -W -Wall) endif() diff --git a/source/bin/omnitrace/details.cpp b/source/bin/omnitrace/details.cpp index e63aaaaa8..4a1bbe155 100644 --- a/source/bin/omnitrace/details.cpp +++ b/source/bin/omnitrace/details.cpp @@ -52,7 +52,13 @@ get_whole_function_names() "rocr::core::BusyWaitSignal::WaitAcquire", "rocr::core::BusyWaitSignal::WaitRelaxed", "rocr::HSA::hsa_signal_wait_scacquire", "rocr::os::ThreadTrampoline", "rocr::image::ImageRuntime::CreateImageManager", - "rocr::AMD::GpuAgent::GetInfo", "rocr::HSA::hsa_agent_get_info", "event_base_loop" + "rocr::AMD::GpuAgent::GetInfo", "rocr::HSA::hsa_agent_get_info", + "event_base_loop", "bootstrapRoot", "bootstrapNetAccept", "ncclCommInitRank", + "ncclCommInitAll", "ncclCommDestroy", "ncclCommCount", "ncclCommCuDevice", + "ncclCommUserRank", "ncclReduce", "ncclBcast", "ncclBroadcast", "ncclAllReduce", + "ncclReduceScatter", "ncclAllGather", "ncclGroupStart", "ncclGroupEnd", + "ncclSend", "ncclRecv", "ncclGather", "ncclScatter", "ncclAllToAll", + "ncclAllToAllv" }; #else // should hopefully be removed soon diff --git a/source/bin/omnitrace/module_function.cpp b/source/bin/omnitrace/module_function.cpp index e693d28d1..39a3f4b45 100644 --- a/source/bin/omnitrace/module_function.cpp +++ b/source/bin/omnitrace/module_function.cpp @@ -429,6 +429,7 @@ module_function::is_routine_constrained() const static std::regex exclude( "(omnitrace|tim::|N3tim|MPI_Init|MPI_Finalize|dyninst|tm_clones)", regex_opts); + static std::regex exclude_printf("(|v|f)printf$", regex_opts); static std::regex exclude_cxx( "(std::_Sp_counted_base|std::(use|has)_facet|std::locale|::sentry|^std::_|::_(M|" "S)_|::basic_string[a-zA-Z,<>: ]+::_M_create|::__|::_(Alloc|State)|" @@ -456,6 +457,11 @@ module_function::is_routine_constrained() const return _report("Excluding", "critical", 3); } + if(std::regex_search(function_name, exclude_printf)) + { + return _report("Excluding", "critical-printf", 3); + } + if(whole.count(function_name) > 0) { return _report("Excluding", "critical-whole-match", 3); diff --git a/source/lib/omnitrace/CMakeLists.txt b/source/lib/omnitrace/CMakeLists.txt index bf75fdf2e..f08a1279e 100644 --- a/source/lib/omnitrace/CMakeLists.txt +++ b/source/lib/omnitrace/CMakeLists.txt @@ -34,6 +34,7 @@ target_link_libraries( $ $ $ + $ $,omnitrace::omnitrace-lto,>> $,omnitrace::omnitrace-static-libgcc,>> $,omnitrace::omnitrace-static-libstdcxx,>> @@ -100,6 +101,7 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/library/perfetto.hpp ${CMAKE_CURRENT_LIST_DIR}/library/process_sampler.hpp ${CMAKE_CURRENT_LIST_DIR}/library/ptl.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/library/rocm.hpp ${CMAKE_CURRENT_LIST_DIR}/library/rocprofiler.hpp ${CMAKE_CURRENT_LIST_DIR}/library/roctracer.hpp @@ -118,6 +120,7 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/library/components/functors.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/omnitrace.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/components/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/rocm_smi.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/rocprofiler.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer.hpp @@ -143,6 +146,13 @@ if(OMNITRACE_USE_ROCTRACER) ${CMAKE_CURRENT_LIST_DIR}/library/roctracer.cpp) endif() +if(OMNITRACE_USE_RCCL) + target_sources( + omnitrace-object-library + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library/components/rcclp.cpp + ${CMAKE_CURRENT_LIST_DIR}/library/rcclp.cpp) +endif() + if(OMNITRACE_USE_ROCPROFILER) target_sources( omnitrace-object-library diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index c512e0977..7aec59d75 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -41,6 +41,7 @@ #include "library/ompt.hpp" #include "library/process_sampler.hpp" #include "library/ptl.hpp" +#include "library/rcclp.hpp" #include "library/rocprofiler.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" @@ -647,6 +648,12 @@ omnitrace_init_tooling_hidden() ompt::setup(); } + if(get_use_rcclp()) + { + OMNITRACE_VERBOSE_F(1, "Setting up RCCLP...\n"); + rcclp::setup(); + } + if(get_use_perfetto() && !is_system_backend()) { #if defined(CUSTOM_DATA_SOURCE) @@ -840,6 +847,12 @@ omnitrace_finalize_hidden(void) } } + if(get_use_rcclp()) + { + OMNITRACE_VERBOSE_F(1, "Shutting down RCCLP...\n"); + rcclp::shutdown(); + } + if(get_use_ompt()) { OMNITRACE_VERBOSE_F(1, "Shutting down OMPT...\n"); diff --git a/source/lib/omnitrace/library/common.hpp b/source/lib/omnitrace/library/common.hpp index 64b13e818..1cc7832e8 100644 --- a/source/lib/omnitrace/library/common.hpp +++ b/source/lib/omnitrace/library/common.hpp @@ -47,6 +47,7 @@ TIMEMORY_DEFINE_NS_API(api, omnitrace) TIMEMORY_DEFINE_NS_API(api, sampling) TIMEMORY_DEFINE_NS_API(api, rocm_smi) +TIMEMORY_DEFINE_NS_API(api, rccl) namespace omnitrace { diff --git a/source/lib/omnitrace/library/components/fwd.hpp b/source/lib/omnitrace/library/components/fwd.hpp index d8e6abd5d..26e578c08 100644 --- a/source/lib/omnitrace/library/components/fwd.hpp +++ b/source/lib/omnitrace/library/components/fwd.hpp @@ -22,6 +22,7 @@ #pragma once +#include "library/common.hpp" #include "library/defines.hpp" #include @@ -32,6 +33,7 @@ #include #include #include +#include #include @@ -40,6 +42,10 @@ TIMEMORY_DEFINE_NS_API(category, process_sampling) TIMEMORY_DECLARE_COMPONENT(roctracer) TIMEMORY_DECLARE_COMPONENT(rocprofiler) +TIMEMORY_DECLARE_COMPONENT(rccl_comm_data) +TIMEMORY_DECLARE_COMPONENT(rcclp_handle) +TIMEMORY_COMPONENT_ALIAS(rccl_api_t, api::rccl) +TIMEMORY_COMPONENT_ALIAS(rccl_data_tracker_t, data_tracker) /// \struct tim::trait::name /// \brief provides a constexpr string in ::value @@ -74,6 +80,7 @@ TIMEMORY_DEFINE_NS_API(category, pthread) TIMEMORY_DEFINE_NS_API(category, kokkos) TIMEMORY_DEFINE_NS_API(category, mpi) TIMEMORY_DEFINE_NS_API(category, ompt) +TIMEMORY_DEFINE_NS_API(category, rccl) TIMEMORY_DEFINE_NS_API(category, critical_trace) TIMEMORY_DEFINE_NS_API(category, host_critical_trace) TIMEMORY_DEFINE_NS_API(category, device_critical_trace) @@ -93,6 +100,7 @@ TIMEMORY_DEFINE_NAME_TRAIT("pthread", category::pthread); TIMEMORY_DEFINE_NAME_TRAIT("kokkos", category::kokkos); TIMEMORY_DEFINE_NAME_TRAIT("mpi", category::mpi); TIMEMORY_DEFINE_NAME_TRAIT("ompt", category::ompt); +TIMEMORY_DEFINE_NAME_TRAIT("rccl", category::rccl); TIMEMORY_DEFINE_NAME_TRAIT("critical-trace", category::critical_trace); TIMEMORY_DEFINE_NAME_TRAIT("host-critical-trace", category::host_critical_trace); TIMEMORY_DEFINE_NAME_TRAIT("device-critical-trace", category::device_critical_trace); @@ -150,6 +158,13 @@ TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::roctracer, false_type) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type) #endif +#if !defined(OMNITRACE_USE_RCCL) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, api::rccl, false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_comm_data, false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_data_tracker_t, false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type) +#endif + #if !defined(TIMEMORY_USE_LIBUNWIND) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::api::sampling, false_type) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::backtrace, false_type) diff --git a/source/lib/omnitrace/library/components/rcclp.cpp b/source/lib/omnitrace/library/components/rcclp.cpp new file mode 100644 index 000000000..4f42487d1 --- /dev/null +++ b/source/lib/omnitrace/library/components/rcclp.cpp @@ -0,0 +1,269 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/components/rcclp.hpp" +#include "library/rcclp.hpp" + +#include + +std::ostream& +operator<<(std::ostream& _os, const ncclUniqueId& _v) +{ + for(auto itr : _v.internal) + _os << itr; + return _os; +} + +namespace tim +{ +namespace component +{ +uint64_t +activate_rcclp() +{ + using handle_t = tim::component::rcclp_handle; + + static auto _handle = std::shared_ptr{}; + + if(!_handle.get()) + { + _handle = std::make_shared(); + _handle->start(); + + auto cleanup_functor = [=]() { + if(_handle) + { + _handle->stop(); + _handle.reset(); + } + }; + + std::stringstream ss; + ss << "timemory-rcclp-" << demangle() << "-" + << demangle(); + tim::manager::instance()->add_cleanup(ss.str(), cleanup_functor); + return 1; + } + return 0; +} +// +//======================================================================================// +// +uint64_t +deactivate_rcclp(uint64_t id) +{ + if(id > 0) + { + std::stringstream ss; + ss << "timemory-rcclp-" << demangle() << "-" + << demangle(); + tim::manager::instance()->cleanup(ss.str()); + return 0; + } + return 1; +} +// +//======================================================================================// +// +void +configure_rcclp(const std::set& permit, const std::set& reject) +{ + static constexpr size_t rcclp_wrapper_count = OMNITRACE_NUM_RCCLP_WRAPPERS; + + using rcclp_gotcha_t = + tim::component::gotcha; + + static bool is_initialized = false; + if(!is_initialized) + { + // generate the gotcha wrappers + rcclp_gotcha_t::get_initializer() = []() { + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 0, ncclGetVersion); + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 1, ncclGetUniqueId); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 2, ncclCommInitRank); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 3, ncclCommInitAll); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 4, ncclCommDestroy); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 5, ncclCommCount); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 6, ncclCommCuDevice); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 7, ncclCommUserRank); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 8, ncclReduce); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 9, ncclBcast); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 10, ncclBroadcast); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 11, ncclAllReduce); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 12, ncclReduceScatter); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 13, ncclAllGather); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 14, ncclGroupStart); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 15, ncclGroupEnd); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 16, ncclSend); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 17, ncclRecv); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 18, ncclGather); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 19, ncclScatter); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 20, ncclAllToAll); + TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 21, ncclAllToAllv); + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 22, ncclRedOpCreatePreMulSum); + // TIMEMORY_C_GOTCHA(rcclp_gotcha_t, 23, ncclRedOpDestroy); + }; + + // provide environment variable for suppressing wrappers + rcclp_gotcha_t::get_reject_list() = [reject]() { + auto _reject = reject; + // check environment + auto reject_list = + tim::get_env("OMNITRACE_RCCLP_REJECT_LIST", ""); + // add environment setting + for(const auto& itr : tim::delimit(reject_list)) + _reject.insert(itr); + return _reject; + }; + + // provide environment variable for selecting wrappers + rcclp_gotcha_t::get_permit_list() = [permit]() { + auto _permit = permit; + // check environment + auto permit_list = + tim::get_env("OMNITRACE_RCCLP_PERMIT_LIST", ""); + // add environment setting + for(const auto& itr : tim::delimit(permit_list)) + _permit.insert(itr); + return _permit; + }; + + is_initialized = true; + } +} + +void +rcclp_handle::start() +{ + if(get_tool_count()++ == 0) + { + get_tool_instance() = std::make_shared("timemory_rcclp"); + get_tool_instance()->start(); + } +} + +void +rcclp_handle::stop() +{ + auto idx = --get_tool_count(); + if(get_tool_instance().get()) + { + get_tool_instance()->stop(); + if(idx == 0) get_tool_instance().reset(); + } +} + +rcclp_handle::persistent_data& +rcclp_handle::get_persistent_data() +{ + static persistent_data _instance; + return _instance; +} + +std::atomic& +rcclp_handle::get_configured() +{ + return get_persistent_data().m_configured; +} + +rcclp_handle::toolset_ptr_t& +rcclp_handle::get_tool_instance() +{ + return get_persistent_data().m_tool; +} + +std::atomic& +rcclp_handle::get_tool_count() +{ + return get_persistent_data().m_count; +} + +void +rccl_comm_data::preinit() +{ + omnitrace::rcclp::configure(); +} + +// ncclReduce +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, + ncclComm_t, hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root)); +} + +// ncclSend +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, + size_t count, ncclDataType_t datatype, int peer, ncclComm_t, + hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", peer)); +} + +// ncclBcast +// ncclRecv +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, void*, size_t count, + ncclDataType_t datatype, int root, ncclComm_t, hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root)); +} + +// ncclBroadcast +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, int root, ncclComm_t, + hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root)); +} + +// ncclAllReduce +// ncclReduceScatter +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, + hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size); +} + +// ncclAllGather +void +rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t) +{ + int size = rccl_type_size(datatype); + add(_data, count * size); +} + +} // namespace component +} // namespace tim + +TIMEMORY_INITIALIZE_STORAGE(rccl_comm_data, rccl_data_tracker_t) diff --git a/source/lib/omnitrace/library/components/rcclp.hpp b/source/lib/omnitrace/library/components/rcclp.hpp new file mode 100644 index 000000000..172ec97f0 --- /dev/null +++ b/source/lib/omnitrace/library/components/rcclp.hpp @@ -0,0 +1,220 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/common.hpp" +#include "library/components/category_region.hpp" +#include "library/components/fwd.hpp" +#include "library/defines.hpp" +#include "library/timemory.hpp" + +#include +#include + +#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200 +# include +#else +# include +#endif + +#include +#include +#include +#include +#include +#include + +#if !defined(OMNITRACE_NUM_RCCLP_WRAPPERS) +# define OMNITRACE_NUM_RCCLP_WRAPPERS 25 +#endif + +TIMEMORY_COMPONENT_ALIAS( + rccl_toolset_t, + component_bundle, + rccl_comm_data*>) +TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t, + gotcha) + +#if !defined(OMNITRACE_USE_RCCL) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type) +#endif + +TIMEMORY_STATISTICS_TYPE(component::rccl_data_tracker_t, float) +TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, component::rccl_data_tracker_t, + true_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, component::rccl_data_tracker_t, + true_type) + +namespace tim +{ +namespace component +{ +uint64_t +activate_rcclp(); + +uint64_t +deactivate_rcclp(uint64_t id); + +void +configure_rcclp(const std::set& permit = {}, + const std::set& reject = {}); + +struct rcclp_handle : base +{ + static constexpr size_t rcclp_wrapper_count = OMNITRACE_NUM_RCCLP_WRAPPERS; + + using value_type = void; + using this_type = rcclp_handle; + using base_type = base; + + using rcclp_tuple_t = tim::component_tuple; + using toolset_ptr_t = std::shared_ptr; + + static std::string label() { return "rcclp_handle"; } + static std::string description() { return "Handle for activating NCCL wrappers"; } + static void get() {} + static void start(); + static void stop(); + static int get_count() { return get_tool_count().load(); } + +private: + struct persistent_data + { + std::atomic m_configured{ 0 }; + std::atomic m_count{ 0 }; + toolset_ptr_t m_tool = toolset_ptr_t{}; + }; + + static persistent_data& get_persistent_data(); + static std::atomic& get_configured(); + static toolset_ptr_t& get_tool_instance(); + static std::atomic& get_tool_count(); +}; + +struct rccl_comm_data : base +{ + using value_type = void; + using this_type = rccl_comm_data; + using base_type = base; + using tracker_t = tim::auto_tuple; + using data_type = float; + + TIMEMORY_DEFAULT_OBJECT(rccl_comm_data) + + static void preinit(); + static void start() {} + static void stop() {} + + static auto rccl_type_size(ncclDataType_t datatype) + { + switch(datatype) + { + case ncclInt8: + case ncclUint8: return 1; + case ncclFloat16: return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: return 8; + default: return 0; + }; + } + + // ncclReduce + static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, + ncclComm_t, hipStream_t); + + // ncclSend + static void audit(const gotcha_data& _data, audit::incoming, const void*, + size_t count, ncclDataType_t datatype, int peer, ncclComm_t, + hipStream_t); + + // ncclBcast + // ncclRecv + static void audit(const gotcha_data& _data, audit::incoming, void*, size_t count, + ncclDataType_t datatype, int root, ncclComm_t, hipStream_t); + + // ncclBroadcast + static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, int root, ncclComm_t, + hipStream_t); + + // ncclAllReduce + // ncclReduceScatter + static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, + hipStream_t); + + // ncclAllGather + static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, + size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t); + +private: + template + static void add(tracker_t& _t, data_type value, Args&&... args) + { + _t.store(std::plus{}, value); + TIMEMORY_FOLD_EXPRESSION(add_secondary(_t, std::forward(args), value)); + } + + template + static void add(const gotcha_data& _data, data_type value, Args&&... args) + { + tracker_t _t{ std::string_view{ _data.tool_id.c_str() } }; + add(_t, value, std::forward(args)...); + } + + template + static void add_secondary(tracker_t&, const gotcha_data& _data, data_type value, + Args&&... args) + { + // if(tim::settings::add_secondary()) + { + tracker_t _s{ std::string_view{ _data.tool_id.c_str() } }; + add(_s, _data, value, std::forward(args)...); + } + } + + template + static void add(std::string_view _name, data_type value, Args&&... args) + { + tracker_t _t{ _name }; + add(_t, value, std::forward(args)...); + } + + template + static void add_secondary(tracker_t&, std::string_view _name, data_type value, + Args&&... args) + { + // if(tim::settings::add_secondary()) + { + tracker_t _s{ _name }; + add(_s, value, std::forward(args)...); + } + } +}; +} // namespace component +} // namespace tim diff --git a/source/lib/omnitrace/library/config.cpp b/source/lib/omnitrace/library/config.cpp index 1fa362fa4..7d82292f7 100644 --- a/source/lib/omnitrace/library/config.cpp +++ b/source/lib/omnitrace/library/config.cpp @@ -24,6 +24,7 @@ #include "library/debug.hpp" #include "library/defines.hpp" #include "library/gpu.hpp" +#include "library/mproc.hpp" #include "library/perfetto.hpp" #include "library/runtime.hpp" @@ -46,9 +47,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -270,6 +273,11 @@ configure_settings(bool _init) "Enable support for Kokkos Tools", false, "kokkos", "backend"); + OMNITRACE_CONFIG_SETTING( + bool, "OMNITRACE_USE_RCCLP", + "Enable support for ROCm Communication Collectives Library (RCCL) Performance", + false, "rocm", "rccl", "backend"); + OMNITRACE_CONFIG_CL_SETTING( bool, "OMNITRACE_KOKKOS_KERNEL_LOGGER", "Enables kernel logging", false, "--omnitrace-kokkos-kernel-logger", "kokkos", "debugging"); @@ -582,12 +590,30 @@ configure_settings(bool _init) } if(!_found_sep && _cmd.size() > 1) _cmd.insert(_cmd.begin() + 1, "--"); + auto _pid = getpid(); + auto _ppid = getppid(); + auto _proc = mproc::get_concurrent_processes(_ppid); + bool _main_proc = (_proc.size() < 2 || *_proc.begin() == _pid); + for(auto&& itr : tim::delimit(_config->get("OMNITRACE_CONFIG_FILE"), ";:")) { if(_config->get_suppress_config()) continue; OMNITRACE_BASIC_VERBOSE(1, "Reading config file %s\n", itr.c_str()); _config->read(itr); + if(_config->get("OMNITRACE_CI") && _main_proc) + { + std::ifstream _in{ itr }; + std::stringstream _iss{}; + while(_in) + { + std::string _s{}; + getline(_in, _s); + _iss << _s << "\n"; + } + OMNITRACE_BASIC_PRINT("config file '%s':\n%s\n", itr.c_str(), + _iss.str().c_str()); + } } settings::suppress_config() = true; @@ -666,6 +692,7 @@ configure_mode_settings() _set("OMNITRACE_USE_ROCTRACER", false); _set("OMNITRACE_USE_ROCPROFILER", false); _set("OMNITRACE_USE_KOKKOSP", false); + _set("OMNITRACE_USE_RCCLP", false); _set("OMNITRACE_USE_OMPT", false); _set("OMNITRACE_USE_SAMPLING", false); _set("OMNITRACE_USE_PROCESS_SAMPLING", false); @@ -721,6 +748,7 @@ configure_mode_settings() _set("OMNITRACE_USE_ROCTRACER", false); _set("OMNITRACE_USE_ROCPROFILER", false); _set("OMNITRACE_USE_KOKKOSP", false); + _set("OMNITRACE_USE_RCCLP", false); _set("OMNITRACE_USE_OMPT", false); _set("OMNITRACE_USE_SAMPLING", false); _set("OMNITRACE_USE_PROCESS_SAMPLING", false); @@ -817,6 +845,7 @@ configure_disabled_settings() _handle_use_option("OMNITRACE_USE_PERFETTO", "perfetto"); _handle_use_option("OMNITRACE_USE_TIMEMORY", "timemory"); _handle_use_option("OMNITRACE_USE_OMPT", "ompt"); + _handle_use_option("OMNITRACE_USE_RCCLP", "rcclp"); _handle_use_option("OMNITRACE_USE_ROCM_SMI", "rocm_smi"); _handle_use_option("OMNITRACE_USE_ROCTRACER", "roctracer"); _handle_use_option("OMNITRACE_USE_ROCPROFILER", "rocprofiler"); @@ -1355,6 +1384,13 @@ get_use_code_coverage() return static_cast&>(*_v->second).get(); } +bool +get_use_rcclp() +{ + static auto _v = get_config()->find("OMNITRACE_USE_RCCLP"); + return static_cast&>(*_v->second).get(); +} + bool get_critical_trace_debug() { diff --git a/source/lib/omnitrace/library/config.hpp b/source/lib/omnitrace/library/config.hpp index 3b0d2bbe2..0fc21f699 100644 --- a/source/lib/omnitrace/library/config.hpp +++ b/source/lib/omnitrace/library/config.hpp @@ -216,6 +216,9 @@ get_use_sampling_cputime(); int get_sampling_rtoffset(); +bool +get_use_rcclp(); + bool get_timeline_sampling(); diff --git a/source/lib/omnitrace/library/perfetto.hpp b/source/lib/omnitrace/library/perfetto.hpp index c237c3df8..f9d7c81b0 100644 --- a/source/lib/omnitrace/library/perfetto.hpp +++ b/source/lib/omnitrace/library/perfetto.hpp @@ -83,6 +83,8 @@ perfetto::Category("kokkos").SetDescription("Kokkos regions"), \ perfetto::Category("mpi").SetDescription("MPI regions"), \ perfetto::Category("ompt").SetDescription("OpenMP Tools regions"), \ + perfetto::Category("rccl").SetDescription( \ + "ROCm Communication Collectives Library (RCCL) regions"), \ perfetto::Category("critical-trace").SetDescription("Combined critical traces"), \ perfetto::Category("host-critical-trace") \ .SetDescription("Host-side critical traces"), \ diff --git a/source/lib/omnitrace/library/rcclp.cpp b/source/lib/omnitrace/library/rcclp.cpp new file mode 100644 index 000000000..420f5b72b --- /dev/null +++ b/source/lib/omnitrace/library/rcclp.cpp @@ -0,0 +1,88 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/components/rcclp.hpp" +#include "library/components/category_region.hpp" +#include "library/components/fwd.hpp" +#include "library/defines.hpp" +#include "library/timemory.hpp" + +#include + +#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200 +# include +#else +# include +#endif + +#include +#include +#include +#include +#include + +static uint64_t global_id = std::numeric_limits::max(); +static void* librccl_handle = nullptr; + +namespace omnitrace +{ +namespace rcclp +{ +void +configure() +{ + comp::rccl_data_tracker_t::label() = "rccl_comm_data"; + comp::rccl_data_tracker_t::description() = "Tracks RCCL communication data"; +} + +void +setup() +{ + configure(); + + // make sure the symbols are loaded to be wrapped + auto libpath = tim::get_env("OMNITRACE_RCCL_LIBRARY", "librccl.so"); + librccl_handle = dlopen(libpath.c_str(), RTLD_NOW | RTLD_GLOBAL); + if(!librccl_handle) fprintf(stderr, "%s\n", dlerror()); + dlerror(); // Clear any existing error + + auto _data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", true); + if(_data) + comp::rccl_toolset_t::get_initializer() = [](comp::rccl_toolset_t& cb) { + cb.initialize(); + }; + + comp::configure_rcclp(); + global_id = comp::activate_rcclp(); + if(librccl_handle) dlclose(librccl_handle); +} + +void +shutdown() +{ + if(global_id < std::numeric_limits::max()) + comp::deactivate_rcclp(global_id); +} +} // namespace rcclp +} // namespace omnitrace diff --git a/source/lib/omnitrace/library/rcclp.hpp b/source/lib/omnitrace/library/rcclp.hpp new file mode 100644 index 000000000..1b04559d8 --- /dev/null +++ b/source/lib/omnitrace/library/rcclp.hpp @@ -0,0 +1,57 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "library/defines.hpp" + +namespace omnitrace +{ +namespace rcclp +{ +void +configure(); + +void +setup(); + +void +shutdown(); + +#if !defined(OMNITRACE_USE_RCCL) || \ + (defined(OMNITRACE_USE_RCCL) && OMNITRACE_USE_RCCL == 0) +inline void +configure() +{} + +inline void +setup() +{} + +inline void +shutdown() +{} +#endif +} // namespace rcclp +} // namespace omnitrace diff --git a/source/lib/omnitrace/library/tpls/rccl/rccl/rccl.h b/source/lib/omnitrace/library/tpls/rccl/rccl/rccl.h new file mode 100644 index 000000000..5fb23b1ab --- /dev/null +++ b/source/lib/omnitrace/library/tpls/rccl/rccl/rccl.h @@ -0,0 +1,522 @@ +/************************************************************************* + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_H_ +#define NCCL_H_ + +#include +#include + +#define NCCL_MAJOR 2 +#define NCCL_MINOR 11 +#define NCCL_PATCH 4 +#define NCCL_SUFFIX "" + +#define NCCL_VERSION_CODE 21104 +#define NCCL_VERSION(X, Y, Z) \ + (((X) <= 2 && (Y) <= 8) ? (X) *1000 + (Y) *100 + (Z) : (X) *10000 + (Y) *100 + (Z)) + +#define RCCL_BFLOAT16 1 +#define RCCL_GATHER_SCATTER 1 +#define RCCL_ALLTOALLV 1 + +#ifdef __cplusplus +extern "C" +{ +#endif + + /*! @brief Opaque handle to communicator */ + typedef struct ncclComm* ncclComm_t; + +#define NCCL_UNIQUE_ID_BYTES 128 + typedef struct + { + char internal[NCCL_UNIQUE_ID_BYTES]; + } ncclUniqueId; + + /*! @brief Error type */ + typedef enum + { + ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclNumResults = 6 + } ncclResult_t; + + /*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. + * + * @details This integer is coded with the MAJOR, MINOR and PATCH level of the + * NCCL library + */ + ncclResult_t ncclGetVersion(int* version); + /// @cond include_hidden + ncclResult_t pncclGetVersion(int* version); + /// @endcond + + /*! @brief Generates an ID for ncclCommInitRank + + @details + Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be + called once and the Id should be distributed to all ranks in the + communicator before calling ncclCommInitRank. + + @param[in] + uniqueId ncclUniqueId* + pointer to uniqueId + + */ + ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); + /// @cond include_hidden + ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); + /// @endcond + + /*! @brief Creates a new communicator (multi thread/process version). + + @details + rank must be between 0 and nranks-1 and unique within a communicator clique. + Each rank is associated to a CUDA device, which has to be set before calling + ncclCommInitRank. + ncclCommInitRank implicitly syncronizes with other ranks, so it must be + called by different threads/processes or use ncclGroupStart/ncclGroupEnd. + + @param[in] + comm ncclComm_t* + communicator struct pointer + */ + ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, + int rank); + /// @cond include_hidden + ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, + int rank); + /// @endcond + + /*! @brief Creates a clique of communicators (single process version). + * + * @details This is a convenience function to create a single-process communicator + * clique. Returns an array of ndev newly initialized communicators in comm. comm + * should be pre-allocated with size at least ndev*sizeof(ncclComm_t). If devlist is + * NULL, the first ndev HIP devices are used. Order of devlist defines user-order of + * processors within the communicator. + * */ + ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); + /// @cond include_hidden + ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); + /// @endcond + + /*! @brief Frees resources associated with communicator object, but waits for any + * operations that might still be running on the device */ + ncclResult_t ncclCommDestroy(ncclComm_t comm); + /// @cond include_hidden + ncclResult_t pncclCommDestroy(ncclComm_t comm); + /// @endcond + + /*! @brief Frees resources associated with communicator object and aborts any + * operations that might still be running on the device. */ + ncclResult_t ncclCommAbort(ncclComm_t comm); + /// @cond include_hidden + ncclResult_t pncclCommAbort(ncclComm_t comm); + /// @endcond + + /*! @brief Returns a human-readable error message. */ + const char* ncclGetErrorString(ncclResult_t result); + const char* pncclGetErrorString(ncclResult_t result); + + /*! @brief Checks whether the comm has encountered any asynchronous errors */ + ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); + /// @cond include_hidden + ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError); + /// @endcond + + /*! @brief Gets the number of ranks in the communicator clique. */ + ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); + /// @cond include_hidden + ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); + /// @endcond + + /*! @brief Returns the rocm device number associated with the communicator. */ + ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); + /// @cond include_hidden + ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); + /// @endcond + + /*! @brief Returns the user-ordered "rank" associated with the communicator. */ + ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); + /// @cond include_hidden + ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); + /// @endcond + + /*! @brief Reduction operation selector */ + /* Reduction operation selector */ + typedef enum + { + ncclNumOps_dummy = 5 + } ncclRedOp_dummy_t; + typedef enum + { + ncclSum = 0, + ncclProd = 1, + ncclMax = 2, + ncclMin = 3, + ncclAvg = 4, + /* ncclNumOps: The number of built-in ncclRedOp_t values. Also + * serves as the least possible value for dynamic ncclRedOp_t's + * as constructed by ncclRedOpCreate*** functions. */ + ncclNumOps = 5, + /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. + * It is defined to be the largest signed value (since compilers + * are permitted to use signed enums) that won't grow + * sizeof(ncclRedOp_t) when compared to previous NCCL versions to + * maintain ABI compatibility. */ + ncclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(ncclRedOp_dummy_t)) + } ncclRedOp_t; + + /*! @brief Data types */ + typedef enum + { + ncclInt8 = 0, + ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, + ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, + ncclHalf = 6, + ncclFloat32 = 7, + ncclFloat = 7, + ncclFloat64 = 8, + ncclDouble = 8, + ncclBfloat16 = 9, + ncclNumTypes = 10 + } ncclDataType_t; + + /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ + typedef enum + { + /* ncclScalarDevice: The scalar is in device-visible memory and will be + * dereferenced while the collective is running. */ + ncclScalarDevice = 0, + + /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be + * dereferenced before the ncclRedOpCreate***() function returns. */ + ncclScalarHostImmediate = 1 + } ncclScalarResidence_t; + + /* + * ncclRedOpCreatePreMulSum + * + * Creates a new reduction operator which pre-multiplies input values by a given + * scalar locally before reducing them with peer values via summation. For use + * only with collectives launched against *comm* and *datatype*. The + * *residence* argument indicates how/when the memory pointed to by *scalar* + * will be dereferenced. Upon return, the newly created operator's handle + * is stored in *op*. + */ + ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, + ncclDataType_t datatype, + ncclScalarResidence_t residence, + ncclComm_t comm); + ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, + ncclDataType_t datatype, + ncclScalarResidence_t residence, + ncclComm_t comm); + + /* + * ncclRedOpDestroy + * + * Destroys the reduction operator *op*. The operator must have been created by + * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be + * destroyed as soon as the last NCCL function which is given that operator returns. + */ + ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); + ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); + + /* + * Collective communication operations + * + * Collective communication operations must be called separately for each + * communicator in a communicator clique. + * + * They return when operations have been enqueued on the CUDA stream. + * + * Since they may perform inter-CPU synchronization, each call has to be done + * from a different thread or process, or need to use Group Semantics (see + * below). + */ + + /*! + * @brief Reduce + * + * @details Reduces data arrays of length count in sendbuff into recvbuff using op + * operation. + * recvbuff may be NULL on all calls except for root device. + * root is the rank (not the CUDA device) where data will reside after the + * operation is complete. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, + ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, + ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief (deprecated) Broadcast (in-place) + * + * @details Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * This operation is implicitely in place. + */ + ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief Broadcast + * + * @details Copies count values from root to all other devices. + * root is the rank (not the HIP device) where data resides before the + * operation is started. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief All-Reduce + * + * @details Reduces data arrays of length count in sendbuff using op operation, and + * leaves identical copies of result on each recvbuff. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! + * @brief Reduce-Scatter + * + * @details Reduces data in sendbuff using op operation and leaves reduced result + * scattered over the devices so that recvbuff on rank i will contain the i-th + * block of the result. + * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff + * should have a size of at least nranks*recvcount elements. + * + * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + */ + ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, ncclRedOp_t op, + ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, + ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief All-Gather + * + * @details Each device gathers sendcount values from other GPUs into recvbuff, + * receiving data from rank i at offset i*sendcount. + * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + * should have a size of at least nranks*sendcount elements. + * + * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + */ + ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief Send + * + * @details Send data from sendbuff to rank peer. + * Rank peer needs to call ncclRecv with the same datatype and the same count from + * this rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv + * operations need to progress concurrently to complete, they must be fused within a + * ncclGroupStart/ ncclGroupEnd section. + */ + ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, + int peer, ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, + int peer, ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief Receive + * + * @details Receive data from rank peer into recvbuff. + * Rank peer needs to call ncclSend with the same datatype and the same count to this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv + * operations need to progress concurrently to complete, they must be fused within a + * ncclGroupStart/ ncclGroupEnd section. + */ + ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, + int peer, ncclComm_t comm, hipStream_t stream); + /// @endcond + + /*! @brief Gather + * + * @details Root device gathers sendcount values from other GPUs into recvbuff, + * receiving data from rank i at offset i*sendcount. + * + * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + * should have a size of at least nranks*sendcount elements. + * + * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + */ + ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief Scatter + * + * @details Scattered over the devices so that recvbuff on rank i will contain the + * i-th block of the data on root. + * + * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff + * should have a size of at least nranks*recvcount elements. + * + * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + */ + ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, int root, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief All-To-All + * + * @details Device (i) send (j)th block of data to device (j) and be placed as (i)th + * block. Each block for sending/receiving has count elements, which means + * that recvbuff and sendbuff should have a size of nranks*count elements. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /*! @brief All-To-Allv + * + * @details Device (i) sends sendcounts[j] of data from offset sdispls[j] + * to device (j). In the same time, device (i) receives recvcounts[j] of data + * from device (j) to be placed at rdispls[j]. + + * sendcounts, sdispls, recvcounts and rdispls are all measured in the units + * of datatype, not bytes. + * + * In-place operation will happen if sendbuff == recvbuff. + */ + ncclResult_t ncclAllToAllv(const void* sendbuff, const size_t sendcounts[], + const size_t sdispls[], void* recvbuff, + const size_t recvcounts[], const size_t rdispls[], + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @cond include_hidden + ncclResult_t pncclAllToAllv(const void* sendbuff, const size_t sendcounts[], + const size_t sdispls[], void* recvbuff, + const size_t recvcounts[], const size_t rdispls[], + ncclDataType_t datatype, ncclComm_t comm, + hipStream_t stream); + /// @endcond + + /* + * Group semantics + * + * When managing multiple GPUs from a single thread, and since NCCL collective + * calls may perform inter-CPU synchronization, we need to "group" calls for + * different ranks/devices into a single call. + * + * Grouping NCCL calls as being part of the same collective operation is done + * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all + * collective calls until the ncclGroupEnd call, which will wait for all calls + * to be complete. Note that for collective communication, ncclGroupEnd only + * guarantees that the operations are enqueued on the streams, not that + * the operation is effectively done. + * + * Both collective communication and ncclCommInitRank can be used in conjunction + * of ncclGroupStart/ncclGroupEnd, but not together. + * + * Group semantics also allow to fuse multiple operations on the same device + * to improve performance (for aggregated collective calls), or to permit + * concurrent progress of multiple send/receive operations. + */ + + /*! @brief Group Start + * + * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into + * a single NCCL operation. Nothing will be started on the CUDA stream until + * ncclGroupEnd. + */ + ncclResult_t ncclGroupStart(); + /// @cond include_hidden + ncclResult_t pncclGroupStart(); + /// @endcond + + /*! @brief Group End + * + * End a group call. Start a fused NCCL operation consisting of all calls since + * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations + * need to be called after ncclGroupEnd. + */ + ncclResult_t ncclGroupEnd(); + /// @cond include_hidden + ncclResult_t pncclGroupEnd(); + /// @endcond + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // end include guard diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5850c0cfb..985d404e2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,6 +12,11 @@ if(NOT DEFINED NUM_PROCS) set(NUM_PROCS 2) endif() +math(EXPR NUM_SAMPLING_PROCS "${NUM_PROCS_REAL}-1") +if(NUM_SAMPLING_PROCS GREATER 3) + set(NUM_SAMPLING_PROCS 3) +endif() + math(EXPR NUM_THREADS "${NUM_PROCS_REAL} + (${NUM_PROCS_REAL} / 2)") if(NUM_THREADS GREATER 12) set(NUM_THREADS 12) @@ -30,8 +35,8 @@ set(_test_openmp_env "OMP_PROC_BIND=spread" "OMP_PLACES=threads" "OMP_NUM_THREAD set(_base_environment "OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_USE_SAMPLING=ON" - "OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}" - "${_test_library_path}") + "OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_FILE_OUTPUT=ON" "${_test_openmp_env}" "${_test_library_path}") set(_flat_environment "OMNITRACE_USE_PERFETTO=ON" @@ -43,11 +48,8 @@ set(_flat_environment "OMNITRACE_COLLAPSE_PROCESSES=ON" "OMNITRACE_COLLAPSE_THREADS=ON" "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count" - "OMP_PROC_BIND=spread" - "OMP_PLACES=threads" - "OMP_NUM_THREADS=2" - "LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}" - ) + "${_test_openmp_env}" + "${_test_library_path}") set(_lock_environment "OMNITRACE_USE_SAMPLING=OFF" @@ -62,18 +64,34 @@ set(_lock_environment "${_test_library_path}") set(_ompt_environment - "OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_TIME_OUTPUT=OFF" - "OMNITRACE_USE_OMPT=ON" "OMNITRACE_CRITICAL_TRACE=OFF" "${_test_openmp_env}" + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_USE_OMPT=ON" + "OMNITRACE_CRITICAL_TRACE=OFF" + "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count,peak_rss" + "${_test_openmp_env}" "${_test_library_path}") set(_perfetto_environment - "OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=OFF" "OMNITRACE_USE_SAMPLING=ON" - "OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}" + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=OFF" + "OMNITRACE_USE_SAMPLING=ON" + "OMNITRACE_USE_PROCESS_SAMPLING=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_PERFETTO_BACKEND=inprocess" + "OMNITRACE_PERFETTO_FILL_POLICY=ring_buffer" + "${_test_openmp_env}" "${_test_library_path}") set(_timemory_environment - "OMNITRACE_USE_PERFETTO=OFF" "OMNITRACE_USE_TIMEMORY=ON" "OMNITRACE_USE_SAMPLING=ON" - "OMNITRACE_USE_PROCESS_SAMPLING=ON" "OMNITRACE_TIME_OUTPUT=OFF" "${_test_openmp_env}" + "OMNITRACE_USE_PERFETTO=OFF" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_USE_SAMPLING=ON" + "OMNITRACE_USE_PROCESS_SAMPLING=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count,peak_rss" + "${_test_openmp_env}" "${_test_library_path}") set(_test_environment ${_base_environment} "OMNITRACE_CRITICAL_TRACE=OFF") @@ -86,7 +104,7 @@ set(_python_environment "OMNITRACE_TIME_OUTPUT=OFF" "OMNITRACE_TREE_OUTPUT=OFF" "OMNITRACE_USE_PID=OFF" - "OMNITRACE_TIMEMORY_COMPONENTS=trip_count" + "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count" "${_test_library_path}" "PYTHONPATH=${PROJECT_BINARY_DIR}/lib/python/site-packages") @@ -100,6 +118,18 @@ set(_attach_environment "OMNITRACE_USE_KOKKOSP=ON" "OMNITRACE_TIME_OUTPUT=OFF" "OMNITRACE_USE_PID=OFF" + "OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count" + "${_test_openmp_env}" + "${_test_library_path}") + +set(_rccl_environment + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_USE_SAMPLING=OFF" + "OMNITRACE_USE_PROCESS_SAMPLING=ON" + "OMNITRACE_USE_RCCLP=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_USE_PID=OFF" "${_test_openmp_env}" "${_test_library_path}") @@ -123,6 +153,83 @@ endif() # -------------------------------------------------------------------------------------- # +set(_VALID_GPU OFF) +if(OMNITRACE_USE_HIP) + set(_VALID_GPU ON) + find_program( + OMNITRACE_ROCM_SMI_EXE + NAMES rocm-smi + HINTS ${ROCmVersion_DIR} + PATHS ${ROCmVersion_DIR} + PATH_SUFFIXES bin) + if(OMNITRACE_ROCM_SMI_EXE) + execute_process( + COMMAND ${OMNITRACE_ROCM_SMI_EXE} + OUTPUT_VARIABLE _RSMI_OUT + ERROR_VARIABLE _RSMI_ERR + RESULT_VARIABLE _RSMI_RET) + if(_RSMI_RET EQUAL 0) + if("${_RSMI_OUTPUT}" MATCHES "ERROR" OR "${_RSMI_ERR}" MATCHES "ERROR") + set(_VALID_GPU OFF) + endif() + else() + set(_VALID_GPU OFF) + endif() + endif() + if(NOT _VALID_GPU) + omnitrace_message(AUTHOR_WARNING + "rocm-smi did not successfully run. Disabling GPU tests...") + endif() +endif() + +set(LULESH_USE_GPU ${LULESH_USE_HIP}) +if(LULESH_USE_CUDA) + set(LULESH_USE_GPU ON) +endif() + +# -------------------------------------------------------------------------------------- # + +function(OMNITRACE_WRITE_TEST_CONFIG _FILE _ENV) + set(_ENV_ONLY "OMNITRACE_(USE_MPIP|DEBUG_SETTINGS)=") + set(_FILE_CONTENTS) + set(_ENV_CONTENTS) + + foreach(_VAL ${${_ENV}}) + if("${_VAL}" MATCHES "^OMNITRACE_" AND NOT "${_VAL}" MATCHES "${_ENV_ONLY}") + set(_FILE_CONTENTS "${_FILE_CONTENTS}${_VAL}\n") + else() + list(APPEND _ENV_CONTENTS "${_VAL}") + endif() + endforeach() + + set(_CONFIG_FILE ${PROJECT_BINARY_DIR}/omnitrace-tests-config/${_FILE}) + file( + WRITE ${_CONFIG_FILE} + "# auto-generated by cmake + +# default values +OMNITRACE_CI = ON +OMNITRACE_VERBOSE = 1 +OMNITRACE_DL_VERBOSE = 1 +OMNITRACE_SAMPLING_FREQ = 50 +OMNITRACE_SAMPLING_DELAY = 0.05 +OMNITRACE_SAMPLING_CPUS = 0-${NUM_SAMPLING_PROCS} +OMNITRACE_SAMPLING_GPUS = $env:HIP_VISIBLE_DEVICES +OMNITRACE_ROCTRACER_HSA_API = ON +OMNITRACE_ROCTRACER_HSA_ACTIVITY = ON + +# test-specific values +${_FILE_CONTENTS} +") + list(APPEND _ENV_CONTENTS "OMNITRACE_CONFIG_FILE=${_CONFIG_FILE}") + list(APPEND _ENV_CONTENTS "OMNITRACE_DEBUG_SETTINGS=1") + set(${_ENV} + "${_ENV_CONTENTS}" + PARENT_SCOPE) +endfunction() + +# -------------------------------------------------------------------------------------- # + function(OMNITRACE_ADD_TEST) foreach(_PREFIX RUNTIME REWRITE REWRITE_RUN) foreach(_TYPE PASS FAIL SKIP) @@ -135,10 +242,17 @@ function(OMNITRACE_ADD_TEST) cmake_parse_arguments( TEST "SKIP_BASELINE;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING" # options - "NAME;TARGET;MPI;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value args + "NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" # single value + # args "${_KWARGS}" # multiple value args ${ARGN}) + if(TEST_GPU AND NOT _VALID_GPU) + omnitrace_message(STATUS + "${TEST_NAME} requires a GPU and no valid GPUs were found") + return() + endif() + if("${TEST_MPI}" STREQUAL "") set(TEST_MPI OFF) endif() @@ -189,7 +303,7 @@ function(OMNITRACE_ADD_TEST) add_test( NAME ${TEST_NAME}-baseline COMMAND ${COMMAND_PREFIX} $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() if(NOT TEST_SKIP_REWRITE) @@ -199,7 +313,7 @@ function(OMNITRACE_ADD_TEST) $ -o $/${TEST_NAME}.inst ${TEST_REWRITE_ARGS} -- $ - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) if(NOT TEST_SKIP_SAMPLING) add_test( @@ -208,7 +322,7 @@ function(OMNITRACE_ADD_TEST) $ -o $/${TEST_NAME}.samp -M sampling ${TEST_REWRITE_ARGS} -- $ - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() add_test( @@ -216,7 +330,7 @@ function(OMNITRACE_ADD_TEST) COMMAND ${COMMAND_PREFIX} $/${TEST_NAME}.inst ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) if(NOT TEST_SKIP_SAMPLING) add_test( @@ -225,7 +339,7 @@ function(OMNITRACE_ADD_TEST) ${COMMAND_PREFIX} $/${TEST_NAME}.samp ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() endif() @@ -234,14 +348,14 @@ function(OMNITRACE_ADD_TEST) NAME ${TEST_NAME}-runtime-instrument COMMAND $ ${TEST_RUNTIME_ARGS} -- $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) if(NOT TEST_SKIP_SAMPLING) add_test( NAME ${TEST_NAME}-runtime-instrument-sampling COMMAND $ -M sampling ${TEST_RUNTIME_ARGS} -- $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY $) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() endif() @@ -303,6 +417,7 @@ function(OMNITRACE_ADD_TEST) endforeach() if(TEST ${TEST_NAME}-${_TEST}) + omnitrace_write_test_config(${TEST_NAME}-${_TEST}.cfg _environ) set_tests_properties( ${TEST_NAME}-${_TEST} PROPERTIES ENVIRONMENT @@ -431,6 +546,7 @@ endfunction() # general config file tests # # -------------------------------------------------------------------------------------- # + file( WRITE ${CMAKE_CURRENT_BINARY_DIR}/invalid.cfg " @@ -476,6 +592,7 @@ omnitrace_add_test( NAME transpose TARGET transpose MPI ${TRANSPOSE_USE_MPI} + GPU ON NUM_PROCS ${NUM_PROCS} REWRITE_ARGS -e -v 2 --print-instructions -E uniform_int_distribution RUNTIME_ARGS @@ -497,6 +614,7 @@ omnitrace_add_test( TARGET transpose LABELS "loops" MPI ${TRANSPOSE_USE_MPI} + GPU ON NUM_PROCS ${NUM_PROCS} REWRITE_ARGS -e @@ -577,78 +695,82 @@ omnitrace_add_test( ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF" REWRITE_FAIL_REGEX "0 instrumented loops in procedure") -omnitrace_add_test( - SKIP_RUNTIME - NAME "mpi" - TARGET mpi-example - MPI ON - NUM_PROCS 4 - REWRITE_ARGS - -e - -v - 2 - --label - file - line - return - args - --min-instructions - 0 - ENVIRONMENT "${_base_environment};GOTCHA_DEBUG=1" - REWRITE_RUN_PASS_REGEX - "(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')" - REWRITE_RUN_FAIL_REGEX "-[0-9][0-9]+.(json|txt|proto)") - -omnitrace_add_test( - SKIP_RUNTIME SKIP_SAMPLING - NAME "mpi-flat-mpip" - TARGET mpi-example - MPI ON - NUM_PROCS 4 - LABELS "mpip" - REWRITE_ARGS - -e - -v - 2 - --label - file - line - return - args - --min-instructions - 0 - ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_USE_MPIP=ON" - REWRITE_RUN_PASS_REGEX - ">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join" - ) +if(OMNITRACE_USE_MPI OR OMNITRACE_USE_MPI_HEADERS) + omnitrace_add_test( + SKIP_RUNTIME + NAME "mpi" + TARGET mpi-example + MPI ON + NUM_PROCS 4 + REWRITE_ARGS + -e + -v + 2 + --label + file + line + return + args + --min-instructions + 0 + ENVIRONMENT "${_base_environment};GOTCHA_DEBUG=1" + REWRITE_RUN_PASS_REGEX + "(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')" + REWRITE_RUN_FAIL_REGEX "-[0-9][0-9]+.(json|txt|proto)") + + omnitrace_add_test( + SKIP_RUNTIME SKIP_SAMPLING + NAME "mpi-flat-mpip" + TARGET mpi-example + MPI ON + NUM_PROCS 4 + LABELS "mpip" + REWRITE_ARGS + -e + -v + 2 + --label + file + line + return + args + --min-instructions + 0 + ENVIRONMENT + "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_STRICT_CONFIG=OFF;OMNITRACE_USE_MPIP=ON" + REWRITE_RUN_PASS_REGEX + ">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join" + ) -omnitrace_add_test( - SKIP_RUNTIME SKIP_SAMPLING - NAME "mpi-flat" - TARGET mpi-example - MPI ON - NUM_PROCS 4 - LABELS "mpip" - REWRITE_ARGS - -e - -v - 2 - --label - file - line - return - args - --min-instructions - 0 - ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF" - REWRITE_RUN_PASS_REGEX - ">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join" - ) + omnitrace_add_test( + SKIP_RUNTIME SKIP_SAMPLING + NAME "mpi-flat" + TARGET mpi-example + MPI ON + NUM_PROCS 4 + LABELS "mpip" + REWRITE_ARGS + -e + -v + 2 + --label + file + line + return + args + --min-instructions + 0 + ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF" + REWRITE_RUN_PASS_REGEX + ">>> main(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall(.*\n.*)>>> pthread_join" + ) +endif() omnitrace_add_test( NAME lulesh TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos" REWRITE_ARGS -e -v 2 --label file line return args @@ -674,6 +796,7 @@ omnitrace_add_test( NAME lulesh-baseline-kokkosp-libomnitrace TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos;kokkos-profile-library" RUN_ARGS -i 10 -s 20 -p @@ -686,6 +809,7 @@ omnitrace_add_test( NAME lulesh-baseline-kokkosp-libomnitrace-dl TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos;kokkos-profile-library" RUN_ARGS -i 10 -s 20 -p @@ -698,6 +822,7 @@ omnitrace_add_test( NAME lulesh-kokkosp TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos" REWRITE_ARGS -e -v 2 @@ -721,6 +846,7 @@ omnitrace_add_test( NAME lulesh-perfetto TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos;loops" REWRITE_ARGS -e -v 2 @@ -743,6 +869,7 @@ omnitrace_add_test( NAME lulesh-timemory TARGET lulesh MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} NUM_PROCS 8 LABELS "kokkos;loops" REWRITE_ARGS -e -v 2 -l --dynamic-callsites --traps --allow-overlapping @@ -965,6 +1092,64 @@ if(TARGET parallel-overhead AND _VALID_PTRACE_SCOPE) "Dyninst was unable to attach to the specified process") endif() +# -------------------------------------------------------------------------------------- # +# +# rccl tests +# +# -------------------------------------------------------------------------------------- # + +foreach(_TARGET ${RCCL_TEST_TARGETS}) + string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}") + string(REPLACE "_" "-" _NAME "${_NAME}") + omnitrace_add_test( + SKIP_SAMPLING + NAME rccl-test-${_NAME} + TARGET ${_TARGET} + LABELS "rccl-tests;rcclp" + MPI ON + GPU ON + NUM_PROCS 1 + REWRITE_ARGS + -e + -v + 2 + -i + 8 + --label + file + line + return + args + RUNTIME_ARGS + -e + -v + 1 + -i + 8 + --label + file + line + return + args + RUN_ARGS -t + 1 + -g + 1 + -i + 10 + -w + 2 + -m + 2 + -p + -c + 1 + -z + -s + 1 + ENVIRONMENT "${_rccl_environment}") +endforeach() + # -------------------------------------------------------------------------------------- # # # python tests