Skip to content

Commit

Permalink
Remove reliance on MPI_Comm_rank (#56)
Browse files Browse the repository at this point in the history
* Remove reliance on MPI_Comm_rank

- read /proc/<PPID>/task/<PPID>/children of the parent process to deduce the rank
- Old format relied on user calling MPI_Comm_rank(MPI_COMM_WORLD, ...)
- if MPI_Comm_rank was called with subcommunicators only, multiple ranks would write to the same file

* Tweak mpi example
  • Loading branch information
jrmadsen authored Jun 20, 2022
1 parent f27f062 commit 8eff363
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 16 deletions.
25 changes: 19 additions & 6 deletions examples/mpi/mpi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include <mpi.h>

#include <cfloat>
#include <chrono>
#include <cmath>
Expand All @@ -30,15 +32,12 @@ THE SOFTWARE.
#include <iostream>
#include <mutex>
#include <random>
#include <sstream>
#include <thread>
#include <type_traits>
#include <unistd.h>
#include <vector>

static std::mutex print_lock{};
using auto_lock_t = std::unique_lock<std::mutex>;

#include <mpi.h>

std::string _name = {};

template <typename Tp, size_t N>
Expand Down Expand Up @@ -105,10 +104,24 @@ main(int argc, char** argv)

printf("[%s] Number of iterations: %i\n", _name.c_str(), nitr);

MPI_Init(&argc, &argv);
int _mpi_thread_provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &_mpi_thread_provided);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);

auto _ppid = getppid();
std::ifstream _ifs{ "/proc/" + std::to_string(_ppid) + "/task/" +
std::to_string(_ppid) + "/children" };
std::stringstream _ss{};
while(_ifs)
{
std::string _s{};
_ifs >> _s;
_ss << _s << " ";
}
printf("[%s] RANK = %i, PID = %i, PPID = %i :: %s\n", _name.c_str(), rank, getpid(),
getppid(), _ss.str().c_str());

MPI_Barrier(MPI_COMM_WORLD);
for(int i = 0; i < nitr; ++i)
{
Expand Down
2 changes: 2 additions & 0 deletions source/lib/omnitrace/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ set(library_sources
${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.cpp
${CMAKE_CURRENT_LIST_DIR}/library/kokkosp.cpp
${CMAKE_CURRENT_LIST_DIR}/library/gpu.cpp
${CMAKE_CURRENT_LIST_DIR}/library/mproc.cpp
${CMAKE_CURRENT_LIST_DIR}/library/ompt.cpp
${CMAKE_CURRENT_LIST_DIR}/library/perfetto.cpp
${CMAKE_CURRENT_LIST_DIR}/library/ptl.cpp
Expand Down Expand Up @@ -90,6 +91,7 @@ set(library_headers
${CMAKE_CURRENT_LIST_DIR}/library/debug.hpp
${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.hpp
${CMAKE_CURRENT_LIST_DIR}/library/gpu.hpp
${CMAKE_CURRENT_LIST_DIR}/library/mproc.hpp
${CMAKE_CURRENT_LIST_DIR}/library/ompt.hpp
${CMAKE_CURRENT_LIST_DIR}/library/perfetto.hpp
${CMAKE_CURRENT_LIST_DIR}/library/ptl.hpp
Expand Down
47 changes: 37 additions & 10 deletions source/lib/omnitrace/library/components/mpi_gotcha.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "library/components/omnitrace.hpp"
#include "library/config.hpp"
#include "library/debug.hpp"
#include "library/mproc.hpp"

#include <thread>
#include <timemory/backends/mpi.hpp>
Expand Down Expand Up @@ -170,19 +171,42 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
comp::activate_mpip<tim::component_tuple<omnitrace::component::omnitrace>,
api::omnitrace>();
}

auto _size = mproc::get_concurrent_processes().size();
if(_size > 0)
{
m_size = _size;
tim::mpi::set_size(_size);
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n", process::get_id(),
tim::mpi::size(), m_size);

auto _rank = mproc::get_process_index();
if(_rank >= 0)
{
m_rank = _rank;
tim::mpi::set_rank(_rank);
tim::settings::default_process_suffix() = _rank;
get_perfetto_output_filename().clear();
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n",
process::get_id(), tim::mpi::rank(), m_rank);
}
}
}
else if(_retval == tim::mpi::success_v && _data.tool_id.find("MPI_Comm_") == 0)
{
if(_data.tool_id == "MPI_Comm_rank")
{
if(m_rank_ptr)
{
m_rank = std::max<int>(*m_rank_ptr, m_rank);
tim::mpi::set_rank(m_rank);
tim::settings::default_process_suffix() = m_rank;
get_perfetto_output_filename().clear();
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n",
process::get_id(), tim::mpi::rank(), m_rank);
if(mproc::get_concurrent_processes().empty())
{
m_rank = std::max<int>(*m_rank_ptr, m_rank);
tim::mpi::set_rank(m_rank);
tim::settings::default_process_suffix() = m_rank;
get_perfetto_output_filename().clear();
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n",
process::get_id(), tim::mpi::rank(), m_rank);
}
}
else
{
Expand All @@ -194,10 +218,13 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
{
if(m_size_ptr)
{
m_size = std::max<int>(*m_size_ptr, m_size);
tim::mpi::set_size(m_size);
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n",
process::get_id(), tim::mpi::size(), m_size);
if(mproc::get_concurrent_processes().empty())
{
m_size = std::max<int>(*m_size_ptr, m_size);
tim::mpi::set_size(m_size);
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n",
process::get_id(), tim::mpi::size(), m_size);
}
}
else
{
Expand Down
74 changes: 74 additions & 0 deletions source/lib/omnitrace/library/mproc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include "library/mproc.hpp"
#include "library/debug.hpp"
#include "library/timemory.hpp"

#include <fstream>
#include <set>
#include <sstream>
#include <string>
#include <unistd.h>

namespace omnitrace
{
namespace mproc
{
// Collect the IDs listed in /proc/<PPID>/task/<PPID>/children.
//
// _ppid: ID of the process whose children file is read. Values <= 0 are rejected
//        without touching the filesystem.
//
// Returns the ordered set of non-negative integers parsed from the file. The set is
// empty when _ppid is not positive or when the file cannot be opened (a warning is
// emitted in the latter case).
std::set<int>
get_concurrent_processes(int _ppid)
{
    std::set<int> _children = {};
    if(_ppid <= 0) return _children;

    auto          _inp = JOIN('/', "/proc", _ppid, "task", _ppid, "children");
    std::ifstream _ifs{ _inp };
    if(!_ifs)
    {
        OMNITRACE_VERBOSE_F(0, "Warning! File '%s' cannot be read\n", _inp.c_str());
        return _children;
    }

    // Test the extraction itself rather than good()/eof(): when the final entry is
    // not followed by whitespace the read succeeds but also sets eofbit, and that
    // value must still be recorded. Parsing stops at the first token that is not an
    // integer or when the input is exhausted.
    for(int _v = -1; _ifs >> _v;)
    {
        // negative values cannot be valid process IDs, skip them
        if(_v >= 0) _children.emplace(_v);
    }
    return _children;
}

// Determine where _pid sits among the IDs returned by
// get_concurrent_processes(_ppid).
//
// Returns the zero-based position of _pid within that ordered set, or -1 when
// _pid is not a member of it.
int
get_process_index(int _pid, int _ppid)
{
    const auto _siblings = get_concurrent_processes(_ppid);
    const auto _pos      = _siblings.find(_pid);
    if(_pos == _siblings.end()) return -1;
    // number of entries ordered before _pid
    return static_cast<int>(std::distance(_siblings.begin(), _pos));
}
} // namespace mproc
} // namespace omnitrace
39 changes: 39 additions & 0 deletions source/lib/omnitrace/library/mproc.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#pragma once

#include <set>
#include <unistd.h>

namespace omnitrace
{
namespace mproc
{
// get the concurrent processes from /proc/<PPID>/task/<PPID>/children
//
// _ppid defaults to the parent of the calling process (getppid()).
// Returns the set of non-negative integer IDs parsed from that file. The set is
// empty when _ppid is not positive or when the file cannot be opened.
std::set<int>
get_concurrent_processes(int _ppid = getppid());

// get the position of _pid within the ordered set returned by
// get_concurrent_processes(_ppid)
//
// _pid defaults to the calling process (getpid()) and _ppid to its parent
// (getppid()).
// Returns the zero-based index of _pid in that set, or -1 when _pid is not a
// member of it (which includes the case where the set is empty).
int
get_process_index(int _pid = getpid(), int _ppid = getppid());
} // namespace mproc
} // namespace omnitrace

0 comments on commit 8eff363

Please sign in to comment.