Skip to content

Commit

Permalink
Initial checkpointing support (#1075)
Browse files Browse the repository at this point in the history
Initial checkpointing support.

- Currently only supports serial runs
- Updates to serialization to support checkpointing
  - Added ability to turn on pointer tracking
  - Added ability to serialize non-pointer variables, but track address
    of data as a pointer.
- Changed templating for handlers to support checkpointing
  - Function pointer is now a non-type template parameter
- Added additional serialization tests and new checkpointing tests
- Added a function to ConfigBase so you could query whether or not an
  option was set on the command line.

Other incidental changes:

- Remove old references to SharedRegion
- Removed type checking in testing comparison filters.  This was done
  because the filter is called for every line of output, and the type
  checks are apparently inefficient enough to be a performance
  bottleneck for tests with a lot of output.
- Added Python Module output to sst-info
- Added fflush to all output in TraceFunction

---------

Co-authored-by: Gwen <grvosku@sandia.gov>
  • Loading branch information
feldergast and gvoskuilen authored May 10, 2024
1 parent ce168a5 commit 38f6b71
Show file tree
Hide file tree
Showing 142 changed files with 106,834 additions and 885 deletions.
2 changes: 2 additions & 0 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -199,5 +199,7 @@ WhitespaceSensitiveMacros:
- BOOST_PP_STRINGIZE
- NS_SWIFT_NAME
- CF_SWIFT_NAME
- SST_SER
- SST_SER_AS_PTR
...

7 changes: 6 additions & 1 deletion src/sst/core/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ nobase_dist_sst_HEADERS = \
activity.h \
clock.h \
baseComponent.h \
checkpointAction.h \
component.h \
componentExtension.h \
componentInfo.h \
Expand Down Expand Up @@ -87,16 +88,19 @@ nobase_dist_sst_HEADERS = \
eli/simpleInfo.h \
eli/statsInfo.h \
eli/subcompSlotInfo.h \
serialization/ser_ptr_type.h \
serialization/serializable.h \
serialization/serializable_fwd.h \
serialization/serialize.h \
serialization/serialize_impl_fwd.h \
serialization/serialize_array.h \
serialization/serialize_atomic.h \
serialization/serialize_buffer_accessor.h \
serialization/serialize_deque.h \
serialization/serialize_list.h \
serialization/serialize_map.h \
serialization/serialize_multiset.h \
serialization/serialize_packer.h \
serialization/serialize_priority_queue.h \
serialization/serialize_serializable.h \
serialization/serialize_set.h \
serialization/serialize_sizer.h \
Expand Down Expand Up @@ -160,6 +164,7 @@ sst_core_sources = \
action.cc \
clock.cc \
baseComponent.cc \
checkpointAction.cc \
component.cc \
componentExtension.cc \
componentInfo.cc \
Expand Down
6 changes: 6 additions & 0 deletions src/sst/core/action.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,10 @@ Action::endSimulation(SimTime_t end)
Simulation_impl::getSimulation()->endSimulation(end);
}

/**
 * Serialization hook for Action.
 *
 * This function serializes nothing itself; it simply forwards the
 * serializer to the Activity base-class implementation.
 *
 * @param ser Serializer object handed in by the serialization framework.
 */
void
Action::serialize_order(SST::Core::Serialization::serializer& ser)
{
    Activity::serialize_order(ser);
}

} // namespace SST
4 changes: 3 additions & 1 deletion src/sst/core/action.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@ class Action : public Activity
protected:
/** Called to signal to the Simulation object to end the simulation */
void endSimulation();

/** Called to signal to the Simulation object to end the simulation
* @param end Simulation cycle when the simulation finishes
*/
void endSimulation(SimTime_t end);

NotSerializable(SST::Action)
void serialize_order(SST::Core::Serialization::serializer& ser) override;
ImplementVirtualSerializable(SST::Action)
};

} // namespace SST
Expand Down
101 changes: 66 additions & 35 deletions src/sst/core/baseComponent.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@

namespace SST {

// Default constructor, intended for use by serialization only.  It sets no
// members itself; they keep whatever in-class default initializers they have.
BaseComponent::BaseComponent() :
    SST::Core::Serialization::serializable()
{}

BaseComponent::BaseComponent(ComponentId_t id) :
SST::Core::Serialization::serializable(),
my_info(Simulation_impl::getSimulation()->getComponentInfo(id)),
sim_(Simulation_impl::getSimulation()),
isExtension(false)
Expand Down Expand Up @@ -106,10 +109,19 @@ BaseComponent::pushValidParams(Params& params, const std::string& type)
params.pushAllowedKeys(keyset);
}

TimeConverter*
BaseComponent::registerClock(const std::string& freq, Clock::HandlerBase* handler, bool regAll)
void
BaseComponent::registerClock_impl(TimeConverter* tc, Clock::HandlerBase* handler, bool regAll)
{
TimeConverter* tc = sim_->registerClock(freq, handler, CLOCKPRIORITY);

// Need to see if I already know about this clock handler
bool found = false;
for ( auto* x : clock_handlers ) {
if ( handler == x ) {
found = true;
break;
}
}
if ( !found ) clock_handlers.push_back(handler);

// Check to see if there is a profile tool installed
auto tools = sim_->getProfileTool<Profile::ClockHandlerProfileTool>("clock");
Expand All @@ -126,52 +138,30 @@ BaseComponent::registerClock(const std::string& freq, Clock::HandlerBase* handle
setDefaultTimeBaseForLinks(tc);
my_info->defaultTimeBase = tc;
}
}


/**
 * Register a clock handler using a clock specification given as a string.
 *
 * The handler is first registered with the simulation object at
 * CLOCKPRIORITY; the remaining per-component work is then delegated to
 * registerClock_impl().
 *
 * @param freq    Clock specification forwarded unchanged to the simulation.
 * @param handler Handler to register.
 * @param regAll  Forwarded to registerClock_impl().
 * @return The TimeConverter returned by the simulation for this clock.
 */
TimeConverter*
BaseComponent::registerClock(const std::string& freq, Clock::HandlerBase* handler, bool regAll)
{
    TimeConverter* const clock_tc = sim_->registerClock(freq, handler, CLOCKPRIORITY);

    // All overloads share the same post-registration handling
    registerClock_impl(clock_tc, handler, regAll);

    return clock_tc;
}

/**
 * Register a clock handler using a clock specification given as a
 * UnitAlgebra.
 *
 * The handler is registered with the simulation object at CLOCKPRIORITY.
 * Profile-tool attachment and the regAll default-time-base handling are done
 * exactly once, inside registerClock_impl(); they must not be repeated here
 * or each profile tool would be added to the handler twice.
 *
 * @param freq    Clock specification forwarded unchanged to the simulation.
 * @param handler Handler to register.
 * @param regAll  Forwarded to registerClock_impl().
 * @return The TimeConverter returned by the simulation for this clock.
 */
TimeConverter*
BaseComponent::registerClock(const UnitAlgebra& freq, Clock::HandlerBase* handler, bool regAll)
{
    TimeConverter* tc = sim_->registerClock(freq, handler, CLOCKPRIORITY);
    registerClock_impl(tc, handler, regAll);
    return tc;
}

/**
 * Register a clock handler using an existing TimeConverter.
 *
 * The handler is registered with the simulation object at CLOCKPRIORITY.
 * The TimeConverter returned by the simulation (not the one passed in) is
 * what gets forwarded to registerClock_impl() and returned to the caller.
 * Profile-tool attachment and the regAll default-time-base handling are done
 * exactly once, inside registerClock_impl(); they must not be repeated here
 * or each profile tool would be added to the handler twice.
 *
 * @param tc      TimeConverter forwarded unchanged to the simulation.
 * @param handler Handler to register.
 * @param regAll  Forwarded to registerClock_impl().
 * @return The TimeConverter returned by the simulation for this clock.
 */
TimeConverter*
BaseComponent::registerClock(TimeConverter* tc, Clock::HandlerBase* handler, bool regAll)
{
    TimeConverter* tcRet = sim_->registerClock(tc, handler, CLOCKPRIORITY);
    registerClock_impl(tcRet, handler, regAll);
    return tcRet;
}

Expand Down Expand Up @@ -832,4 +822,45 @@ BaseComponent::getComponentProfileTools(const std::string& point)
return sim_->getProfileTool<Profile::ComponentProfileTool>(point);
}

/**
 * Serialize or restore the state of this BaseComponent.
 *
 * my_info and isExtension are handed directly to the serializer.  The clock
 * handlers are stored as a count followed by one (handler, value) pair per
 * entry of clock_handlers, where the value is what
 * Simulation_impl::getClockForHandler() reports for that handler.
 *
 * On UNPACK the pairs are read back in the same order: every handler is
 * appended to clock_handlers, and a handler whose stored value is non-zero
 * is registered with the simulation again.  sim_ itself is not serialized;
 * it is re-fetched from Simulation_impl::getSimulation().
 *
 * @param ser Serializer operating in SIZER, PACK or UNPACK mode.
 */
void
BaseComponent::serialize_order(SST::Core::Serialization::serializer& ser)
{
    ser& my_info;
    ser& isExtension;

    switch ( ser.mode() ) {
    case SST::Core::Serialization::serializer::SIZER:
    case SST::Core::Serialization::serializer::PACK:
    {
        // Need to serialize each handler
        std::pair<Clock::HandlerBase*, SimTime_t> p;
        size_t                                    num_handlers = clock_handlers.size();
        ser&                                      num_handlers;
        for ( auto* handler : clock_handlers ) {
            p.first  = handler;
            // See if it's currently registered with a clock
            p.second = sim_->getClockForHandler(handler);
            ser& p;
        }
        break;
    }
    case SST::Core::Serialization::serializer::UNPACK:
    {
        // sim_ is not part of the serialized data, so fetch it again
        sim_ = Simulation_impl::getSimulation();
        std::pair<Clock::HandlerBase*, SimTime_t> p;
        // Initialized so the serializer is never handed an indeterminate
        // value; it is overwritten with the stored count just below.
        size_t                                    num_handlers = 0;
        ser&                                      num_handlers;
        for ( size_t i = 0; i < num_handlers; ++i ) {
            ser& p;
            // Add handler to clock_handlers list
            clock_handlers.push_back(p.first);
            // If it was previously registered, register it now
            if ( p.second ) { sim_->registerClock(p.second, p.first, CLOCKPRIORITY); }
        }
        break;
    }
    }
}


} // namespace SST
62 changes: 58 additions & 4 deletions src/sst/core/baseComponent.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ class UnitAlgebra;
/**
* Main component object for the simulation.
*/
class BaseComponent
class BaseComponent : public SST::Core::Serialization::serializable
{

friend class Component;
friend class ComponentExtension;
friend class ComponentInfo;
friend class SubComponent;
Expand All @@ -61,6 +62,9 @@ class BaseComponent
BaseComponent*, Statistics::StatisticProcessingEngine*, const std::string& /*type*/,
const std::string& /*name*/, const std::string& /*subId*/, Params&)>;

// For serialization only
BaseComponent();

public:
BaseComponent(ComponentId_t id);
virtual ~BaseComponent();
Expand Down Expand Up @@ -353,6 +357,16 @@ class BaseComponent
}

private:
ImplementSerializable(SST::BaseComponent)
void serialize_order(SST::Core::Serialization::serializer& ser) override;


/**
Handles the profile points, default time base, handler tracking
and checkpointing.
*/
void registerClock_impl(TimeConverter* tc, Clock::HandlerBase* handler, bool regAll);

template <typename T>
Statistics::Statistic<T>*
createStatistic(SST::Params& params, StatisticId_t id, const std::string& name, const std::string& statSubId)
Expand Down Expand Up @@ -870,9 +884,13 @@ class BaseComponent
std::vector<Profile::ComponentProfileTool*> getComponentProfileTools(const std::string& point);

private:
ComponentInfo* my_info = nullptr;
Simulation_impl* sim_ = nullptr;
bool isExtension;
ComponentInfo* my_info = nullptr;
Simulation_impl* sim_ = nullptr;
bool isExtension = false;

// Need to track clock handlers for checkpointing. We need to
// know what clock handlers we have registered with the core
std::vector<Clock::HandlerBase*> clock_handlers;

void addSelfLink(const std::string& name);
Link* getLinkFromParentSharedPort(const std::string& port);
Expand Down Expand Up @@ -1077,6 +1095,42 @@ class SubComponentSlotInfo
}
};

namespace Core {
namespace Serialization {

/**
 * Serialization support for pointers to Statistic<T>.
 *
 * The statistic object itself is not written.  In SIZER and PACK mode only
 * the component pointer returned by s->getComponent() is serialized.  In
 * UNPACK mode that component pointer is read back and s is set to a new
 * "sst.NullStatistic" created through the Factory with empty Params and
 * empty name strings.
 *
 * NOTE(review): s is dereferenced without a null check in SIZER/PACK mode;
 * confirm that callers never serialize a null Statistic pointer.
 */
template <class T>
class serialize_impl<Statistic<T>*>
{
    template <class A>
    friend class serialize;
    void operator()(Statistic<T>*& s, serializer& ser)
    {
        // For sizer and pack, only need to get the information needed
        // to create a NullStatistic on unpack.
        switch ( ser.mode() ) {
        case serializer::SIZER:
        case serializer::PACK:
        {
            BaseComponent* comp = s->getComponent();
            ser&           comp;
            break;
        }
        case serializer::UNPACK:
        {
            Params         params;
            // Start from nullptr so the serializer never sees an
            // indeterminate pointer value; it is filled in by the read below.
            BaseComponent* comp = nullptr;
            ser&           comp;
            s = Factory::getFactory()->CreateWithParams<Statistic<T>>(
                "sst.NullStatistic", params, comp, "", "", params);
            break;
        }
        }
    }
};

} // namespace Serialization
} // namespace Core

} // namespace SST

#endif // SST_CORE_BASECOMPONENT_H
4 changes: 3 additions & 1 deletion src/sst/core/cfgoutput/jsonConfigOutput.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,14 @@ JSONConfigGraphOutput::generate(const Config* cfg, ConfigGraph* graph)
outputJson["program_options"]["print-timing-info"] = cfg->print_timing() ? "true" : "false";
// Ignore stopAfter for now
// outputJson["program_options"]["stopAfter"] = cfg->stopAfterSec();
outputJson["program_options"]["heartbeat-period"] = cfg->heartbeatPeriod();
outputJson["program_options"]["heartbeat-period"] = cfg->heartbeat_period();
outputJson["program_options"]["timebase"] = cfg->timeBase();
outputJson["program_options"]["partitioner"] = cfg->partitioner();
outputJson["program_options"]["timeVortex"] = cfg->timeVortex();
outputJson["program_options"]["interthread-links"] = cfg->interthread_links() ? "true" : "false";
outputJson["program_options"]["output-prefix-core"] = cfg->output_core_prefix();
outputJson["program_options"]["checkpoint-period"] = cfg->checkpoint_period();


// Put in the global param sets
for ( const auto& set : getGlobalParamSetNames() ) {
Expand Down
4 changes: 3 additions & 1 deletion src/sst/core/cfgoutput/pythonConfigOutput.cc
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ PythonConfigGraphOutput::generate(const Config* cfg, ConfigGraph* graph)
outputFile, "sst.setProgramOption(\"print-timing-info\", \"%s\")\n", cfg->print_timing() ? "true" : "false");
// Ignore stopAfter for now
// fprintf(outputFile, "sst.setProgramOption(\"stopAfter\", \"%" PRIu32 "\")\n", cfg->stopAfterSec);
fprintf(outputFile, "sst.setProgramOption(\"heartbeat-period\", \"%s\")\n", cfg->heartbeatPeriod().c_str());
fprintf(outputFile, "sst.setProgramOption(\"heartbeat-period\", \"%s\")\n", cfg->heartbeat_period().c_str());
fprintf(outputFile, "sst.setProgramOption(\"timebase\", \"%s\")\n", cfg->timeBase().c_str());
fprintf(outputFile, "sst.setProgramOption(\"partitioner\", \"%s\")\n", cfg->partitioner().c_str());
fprintf(outputFile, "sst.setProgramOption(\"timeVortex\", \"%s\")\n", cfg->timeVortex().c_str());
Expand All @@ -238,6 +238,8 @@ PythonConfigGraphOutput::generate(const Config* cfg, ConfigGraph* graph)
cfg->interthread_links() ? "true" : "false");
fprintf(outputFile, "sst.setProgramOption(\"output-prefix-core\", \"%s\")\n", cfg->output_core_prefix().c_str());

fprintf(outputFile, "sst.setProgramOption(\"checkpoint-period\", \"%s\")\n", cfg->checkpoint_period().c_str());

// Output the global params
fprintf(outputFile, "# Define the global parameter sets:\n");
std::vector<std::string> global_param_sets = getGlobalParamSetNames();
Expand Down
Loading

0 comments on commit 38f6b71

Please sign in to comment.