diff --git a/CMake/ClangCompilers.cmake b/CMake/ClangCompilers.cmake index d379712dc5..92841bc564 100644 --- a/CMake/ClangCompilers.cmake +++ b/CMake/ClangCompilers.cmake @@ -22,14 +22,14 @@ ENDIF(QMC_OMP) # Set clang specific flags (which we always want) ADD_DEFINITIONS( -Drestrict=__restrict__ ) -SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer -fstrict-aliasing") -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer -fstrict-aliasing -D__forceinline=inline") +SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -D__forceinline=inline") # Set extra optimization specific flags -SET( CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -ffast-math" ) -SET( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ffast-math" ) -SET( CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -ffast-math" ) -SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -ffast-math" ) +SET( CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fomit-frame-pointer -ffast-math" ) +SET( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fomit-frame-pointer -ffast-math" ) +SET( CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -fomit-frame-pointer -ffast-math" ) +SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fomit-frame-pointer -ffast-math" ) # Set extra debug flags SET( CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer -fstandalone-debug" ) diff --git a/CMake/GNUCompilers.cmake b/CMake/GNUCompilers.cmake index e0b2bff750..9fed49c20f 100755 --- a/CMake/GNUCompilers.cmake +++ b/CMake/GNUCompilers.cmake @@ -16,8 +16,11 @@ ENDIF(QMC_OMP) # Set gnu specific flags (which we always want) ADD_DEFINITIONS( -Drestrict=__restrict__ ) -SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer -finline-limit=1000 -fstrict-aliasing -funroll-all-loops") -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer -finline-limit=1000 -fstrict-aliasing -funroll-all-loops -D__forceinline=inline") +SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -finline-limit=1000 -fstrict-aliasing -funroll-all-loops") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finline-limit=1000 -fstrict-aliasing -funroll-all-loops -D__forceinline=inline") + +SET( CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer" ) +SET( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer" ) # Suppress compile warnings SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecated") @@ -32,10 +35,10 @@ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" ) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wcomment -Wmisleading-indentation -Wmaybe-uninitialized -Wuninitialized -Wreorder -Wno-unknown-pragmas -Wno-sign-compare") # Set extra optimization specific flags -SET( CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -ffast-math" ) -SET( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ffast-math" ) -SET( CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -ffast-math" ) -SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -ffast-math" ) +SET( CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fomit-frame-pointer -ffast-math" ) +SET( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fomit-frame-pointer -ffast-math" ) +SET( CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -fomit-frame-pointer -ffast-math" ) +SET( CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fomit-frame-pointer -ffast-math" ) # Special architectural flags #-------------------------------------- diff --git a/src/MinimalContainers/ConstantSizeMatrix.hpp b/src/MinimalContainers/ConstantSizeMatrix.hpp index c6d2d3fdf4..dd28e7d54c 100644 --- a/src/MinimalContainers/ConstantSizeMatrix.hpp +++ b/src/MinimalContainers/ConstantSizeMatrix.hpp @@ -143,8 +143,6 @@ class ConstantSizeMatrix T* data() { return data_.data(); } const T* data() const { return data_.data(); } - size_t sizeofElement() { return sizeof(T); } - size_t capacity() { return n_max_ * m_max_; } size_t n_capacity() { return n_max_; } diff --git a/src/Particle/MCWalkerConfiguration.cpp b/src/Particle/MCWalkerConfiguration.cpp index 0b3e81d88c..63cc8bbb53 100644 --- a/src/Particle/MCWalkerConfiguration.cpp +++ b/src/Particle/MCWalkerConfiguration.cpp @@ -293,6 +293,15 @@ void MCWalkerConfiguration::copyWalkerRefs(Walker_t* head, Walker_t* tail) WalkerList[1] = tail; } +void MCWalkerConfiguration::fakeWalkerList(Walker_t* first, Walker_t* second) +{ + if (WalkerList.size() != 0) + throw std::runtime_error("This should only be called in tests and only with a fresh MCWC!"); + OwnWalkers = false; + WalkerList.push_back(first); + WalkerList.push_back(second); +} + /** Make Metropolis move to the walkers and save in a temporary array. * @param it the iterator of the first walker to work on * @param tauinv inverse of the time step diff --git a/src/Particle/MCWalkerConfiguration.h b/src/Particle/MCWalkerConfiguration.h index f623505387..cde8be7045 100644 --- a/src/Particle/MCWalkerConfiguration.h +++ b/src/Particle/MCWalkerConfiguration.h @@ -168,12 +168,17 @@ class MCWalkerConfiguration : public ParticleSet * @param head pointer to the head walker * @param tail pointer to the tail walker * + * \todo I believe this can/should be deleted * Special function introduced to work with Reptation method. * Clear the current WalkerList and add two walkers, head and tail. * OwnWalkers are set to false. */ void copyWalkerRefs(Walker_t* head, Walker_t* tail); + /** make fake walker list for testing + */ + void fakeWalkerList(Walker_t* first, Walker_t* second); + ///clean up the walker list and make a new list void resize(int numWalkers, int numPtcls); diff --git a/src/Particle/ParticleSet.cpp b/src/Particle/ParticleSet.cpp index 85298e36e8..08796a8f8c 100644 --- a/src/Particle/ParticleSet.cpp +++ b/src/Particle/ParticleSet.cpp @@ -52,8 +52,8 @@ ParticleSet::ParticleSet() ThreadID(0), activePtcl(-1), SK(0), - myTwist(0.0), Properties(0, 0, 1, WP::MAXPROPERTIES), + myTwist(0.0), ParentName("0"), TotalNum(0) { @@ -68,8 +68,8 @@ ParticleSet::ParticleSet(const ParticleSet& p) activePtcl(-1), mySpecies(p.getSpeciesSet()), SK(0), - myTwist(0.0), Properties(p.Properties), + myTwist(0.0), ParentName(p.parentName()) { set_quantum_domain(p.quantum_domain); diff --git a/src/Particle/Walker.h b/src/Particle/Walker.h index 767587e8a2..9945957106 100644 --- a/src/Particle/Walker.h +++ b/src/Particle/Walker.h @@ -430,7 +430,7 @@ struct Walker #endif //Don't add the nLocal but the actual allocated size. We want to register once for the life of a //walker so we leave space for additional properties. - DataSet.add(Properties.data(), Properties.data() + Properties.sizeofElement() * Properties.capacity()); + DataSet.add(Properties.data(), Properties.data() + Properties.capacity()); //DataSet.add(Properties.first_address(), Properties.last_address()); // \todo likely to be broken if the Properties change above is needed. @@ -463,7 +463,7 @@ struct Walker DataSet.get(G.first_address(), G.last_address()); DataSet.get(L.first_address(), L.last_address()); #endif - DataSet.get(Properties.data(), Properties.data() + Properties.sizeofElement() * Properties.capacity()); + DataSet.get(Properties.data(), Properties.data() + Properties.capacity()); for (int iat = 0; iat < PropertyHistory.size(); iat++) DataSet.get(PropertyHistory[iat].data(), PropertyHistory[iat].data() + PropertyHistory[iat].size()); DataSet.get(PHindex.data(), PHindex.data() + PHindex.size()); @@ -512,7 +512,7 @@ struct Walker DataSet.put(G.first_address(), G.last_address()); DataSet.put(L.first_address(), L.last_address()); #endif - DataSet.put(Properties.data(), Properties.data() + Properties.sizeofElement() * Properties.capacity()); + DataSet.put(Properties.data(), Properties.data() + Properties.capacity()); for (int iat = 0; iat < PropertyHistory.size(); iat++) DataSet.put(PropertyHistory[iat].data(), PropertyHistory[iat].data() + PropertyHistory[iat].size()); DataSet.put(PHindex.data(), PHindex.data() + PHindex.size()); diff --git a/src/Particle/tests/test_walker.cpp b/src/Particle/tests/test_walker.cpp index f1f4a833e6..d89f2194cb 100644 --- a/src/Particle/tests/test_walker.cpp +++ b/src/Particle/tests/test_walker.cpp @@ -2,15 +2,18 @@ // This file is distributed under the University of Illinois/NCSA Open Source License. // See LICENSE file in top directory for details. // -// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// Copyright (c) 2020 QMCPACK developers. // -// File developed by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign +// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Lab +// Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign // // File created by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign ////////////////////////////////////////////////////////////////////////////////////// #include "catch.hpp" +#include +#include "QMCDrivers/WalkerProperties.h" #include "Configuration.h" #include "Particle/MCWalkerConfiguration.h" #include "Particle/HDFWalkerOutput.h" @@ -24,13 +27,15 @@ using std::string; namespace qmcplusplus { -typedef Walker Walker_t; + +using MCPWalker = Walker; +using WP = WalkerProperties::Indexes; TEST_CASE("walker", "[particle]") { OHMMS::Controller->initialize(0, NULL); - Walker_t w(1); + MCPWalker w(1); REQUIRE(w.R.size() == 1); w.R[0] = 1.0; @@ -44,7 +49,7 @@ TEST_CASE("walker", "[particle]") TEST_CASE("walker assumptions", "[particle]") { using WP = WalkerProperties::Indexes; - Walker_t w1(1); + MCPWalker w1(1); REQUIRE(w1.Properties.cols() == WP::NUMPROPERTIES); } @@ -53,10 +58,10 @@ TEST_CASE("walker HDF read and write", "[particle]") OHMMS::Controller->initialize(0, NULL); Communicate* c = OHMMS::Controller; - Walker_t w1(1); + MCPWalker w1(1); w1.R[0] = 1.0; - Walker_t w2(1); + MCPWalker w2(1); w2.R[0] = 0.5; MCWalkerConfiguration W; @@ -110,4 +115,28 @@ TEST_CASE("walker HDF read and write", "[particle]") } } +TEST_CASE("walker buffer add, update, restore", "[particle]") +{ + int num_particles = 4; + + UPtrVector walkers(2); + auto createWalker = [num_particles](UPtr& walker_ptr) { + walker_ptr = std::make_unique(num_particles); + walker_ptr->registerData(); + walker_ptr->DataSet.allocate(); + }; + std::for_each(walkers.begin(), walkers.end(), createWalker); + + walkers[0]->Properties(WP::LOGPSI) = 1.2; + walkers[0]->Properties(WP::SIGN) = 1.3; + walkers[0]->Properties(WP::UMBRELLAWEIGHT) = 1.4; + walkers[0]->Properties(WP::LOCALPOTENTIAL) = 1.6; + walkers[0]->updateBuffer(); + + std::memcpy(walkers[1]->DataSet.data(), walkers[0]->DataSet.data(), walkers[0]->DataSet.size()); + walkers[1]->copyFromBuffer(); + CHECK(walkers[1]->Properties(WP::LOGPSI) == Approx(1.2)); + CHECK(walkers[1]->Properties(WP::LOCALPOTENTIAL) == Approx(1.6)); +} + } // namespace qmcplusplus diff --git a/src/QMCApp/QMCMain.cpp b/src/QMCApp/QMCMain.cpp index a29823c9e7..69a7ef2553 100644 --- a/src/QMCApp/QMCMain.cpp +++ b/src/QMCApp/QMCMain.cpp @@ -548,7 +548,7 @@ bool QMCMain::runQMC(xmlNodePtr cur, bool reuse) if(!population_) { - population_.reset(new MCPopulation(myComm->size(), *qmcSystem, ptclPool->getParticleSet("e"), psiPool->getPrimary(), hamPool->getPrimary())); + population_.reset(new MCPopulation(myComm->size(), *qmcSystem, ptclPool->getParticleSet("e"), psiPool->getPrimary(), hamPool->getPrimary(), myComm->rank())); } if (reuse) qmc_driver = std::move(last_driver); diff --git a/src/QMCApp/tests/test_QMCDriverFactory.cpp b/src/QMCApp/tests/test_QMCDriverFactory.cpp index 4414c5ba79..1aa35f9ab6 100644 --- a/src/QMCApp/tests/test_QMCDriverFactory.cpp +++ b/src/QMCApp/tests/test_QMCDriverFactory.cpp @@ -60,7 +60,7 @@ TEST_CASE("QMCDriverFactory create VMC Driver", "[qmcapp]") std::unique_ptr qmc_driver; MCPopulation population(comm->size(), particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), - hamiltonian_pool.getPrimary()); + hamiltonian_pool.getPrimary(), comm->rank()); qmc_driver = driver_factory.newQMCDriver(std::move(last_driver), 0, node, das, *qmc_system, particle_pool, wavefunction_pool, hamiltonian_pool, population, comm); @@ -96,7 +96,7 @@ TEST_CASE("QMCDriverFactory create VMCBatched driver", "[qmcapp]") std::unique_ptr last_driver; std::unique_ptr qmc_driver; MCPopulation population(comm->size(), particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), - hamiltonian_pool.getPrimary()); + hamiltonian_pool.getPrimary(), comm->rank()); qmc_driver = driver_factory.newQMCDriver(std::move(last_driver), 0, node, das, *qmc_system, particle_pool, wavefunction_pool, hamiltonian_pool, population, comm); @@ -132,7 +132,7 @@ TEST_CASE("QMCDriverFactory create DMC driver", "[qmcapp]") std::unique_ptr last_driver; std::unique_ptr qmc_driver; MCPopulation population(comm->size(), particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), - hamiltonian_pool.getPrimary()); + hamiltonian_pool.getPrimary(),comm->rank()); qmc_driver = driver_factory.newQMCDriver(std::move(last_driver), 0, node, das, *qmc_system, particle_pool, wavefunction_pool, hamiltonian_pool, population, comm); @@ -168,7 +168,7 @@ TEST_CASE("QMCDriverFactory create DMCBatched driver", "[qmcapp]") std::unique_ptr last_driver; std::unique_ptr qmc_driver; MCPopulation population(comm->size(), particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), - hamiltonian_pool.getPrimary()); + hamiltonian_pool.getPrimary(),comm->rank()); qmc_driver = driver_factory.newQMCDriver(std::move(last_driver), 0, node, das, *qmc_system, particle_pool, wavefunction_pool, hamiltonian_pool, population, comm); diff --git a/src/QMCApp/tests/test_QMCDriverFactory_CUDA.cpp b/src/QMCApp/tests/test_QMCDriverFactory_CUDA.cpp index dc10fc3c60..b7fb80b2cf 100644 --- a/src/QMCApp/tests/test_QMCDriverFactory_CUDA.cpp +++ b/src/QMCApp/tests/test_QMCDriverFactory_CUDA.cpp @@ -77,7 +77,7 @@ TEST_CASE("QMCDriverFactory create VMC_CUDA Driver", "[qmcapp]") MCWalkerConfiguration* qmc_system = particle_pool.getWalkerSet(target); MCPopulation population(comm->size(), particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), - hamiltonian_pool.getPrimary()); + hamiltonian_pool.getPrimary(), comm->rank()); std::unique_ptr last_driver; std::unique_ptr qmc_driver; diff --git a/src/QMCDrivers/Crowd.cpp b/src/QMCDrivers/Crowd.cpp index ed79b615d7..913a1c65cc 100644 --- a/src/QMCDrivers/Crowd.cpp +++ b/src/QMCDrivers/Crowd.cpp @@ -41,15 +41,8 @@ void Crowd::addWalker(MCPWalker& walker, ParticleSet& elecs, TrialWaveFunction& void Crowd::loadWalkers() { - auto it_walker = mcp_walkers_.begin(); - auto it_walker_elecs = walker_elecs_.begin(); - //flex walkers here - while (it_walker != mcp_walkers_.end()) - { - (*it_walker_elecs).get().loadWalker(*it_walker, true); - ++it_walker; - ++it_walker_elecs; - } + for(int i = 0; i < mcp_walkers_.size(); ++i) + walker_elecs_[i].get().loadWalker(mcp_walkers_[i], true); } void Crowd::setRNGForHamiltonian(RandomGenerator_t& rng) diff --git a/src/QMCDrivers/DMC/DMCBatched.cpp b/src/QMCDrivers/DMC/DMCBatched.cpp index cd8c611c81..3d9967e70a 100644 --- a/src/QMCDrivers/DMC/DMCBatched.cpp +++ b/src/QMCDrivers/DMC/DMCBatched.cpp @@ -222,7 +222,7 @@ void DMCBatched::advanceWalkers(const StateForThread& sft, //This is just convenient to do here rr_proposed[iw] += rr[iw]; } - + std::transform(delta_r_start, delta_r_end, log_gf.begin(), [](auto& delta_r) { constexpr RealType mhalf(-0.5); return mhalf * dot(delta_r, delta_r); @@ -409,8 +409,8 @@ void DMCBatched::handleMovedWalkers(DMCPerWalkerRefs& moved, const StateForThrea moved.rr_proposed[iw]); // this might mean new_energies are actually unneeded which would be nice. moved.new_energies[iw] = local_energies[iw]; - moved.walkers[iw].get().Weight *= - sft.branch_engine.branchWeightBare(moved.new_energies[iw], moved.old_energies[iw]); + FullPrecRealType branch_weight = sft.branch_engine.branchWeight(moved.new_energies[iw], moved.old_energies[iw]); + moved.walkers[iw].get().Weight *= branch_weight; } timers.collectables_timer.start(); auto evaluateNonPhysicalHamiltonianElements = [](QMCHamiltonian& ham, ParticleSet& pset, MCPWalker& walker) { @@ -457,11 +457,13 @@ void DMCBatched::setMultiplicities(const DMCDriverInput& dmcdriver_input, auto setMultiplicity = [&dmcdriver_input, &rng](MCPWalker& walker) { constexpr RealType onehalf(0.5); constexpr RealType cone(1); - RealType M = walker.Weight; + RealType M; if (walker.Age > dmcdriver_input.get_max_age()) M = std::min(onehalf, M); else if (walker.Age > 0) M = std::min(cone, M); + else + M = walker.Weight; walker.Multiplicity = M + rng(); }; for (int iw = 0; iw < walkers.size(); ++iw) diff --git a/src/QMCDrivers/DMC/DMCDriverInput.h b/src/QMCDrivers/DMC/DMCDriverInput.h index c67d4fb118..b99e6d4eb4 100644 --- a/src/QMCDrivers/DMC/DMCDriverInput.h +++ b/src/QMCDrivers/DMC/DMCDriverInput.h @@ -56,7 +56,7 @@ class DMCDriverInput ///input std::string to use fast gradient std::string UseFastGrad; ///input to control maximum age allowed for walkers. - IndexType max_age_ = 0; + IndexType max_age_ = 10; double alpha_ = 0.0; double gamma_ = 0.0; public: diff --git a/src/QMCDrivers/DMC/WalkerControlMPI.cpp b/src/QMCDrivers/DMC/WalkerControlMPI.cpp index e9f8a06c3f..accd24cd33 100644 --- a/src/QMCDrivers/DMC/WalkerControlMPI.cpp +++ b/src/QMCDrivers/DMC/WalkerControlMPI.cpp @@ -13,6 +13,7 @@ ////////////////////////////////////////////////////////////////////////////////////// +#include #include #include #include @@ -47,7 +48,14 @@ TimerNameList_t DMCMPITimerNames = {{DMC_MPI_branch, "WalkerCont /** default constructor * - * set SwapMode + * set SwapMode? SwapMode is set to 1 but what does that mean? + * This object persists inside the SFNB which also persists + * The zeroing here will not happen in late QMC sections... + * This seems problematic in that NumWalkersSent will start at a + * value of no concern to the current section. + * + * In the new drivers SFNB should throw an except if there is attempted + * reuse of WalkerController */ WalkerControlMPI::WalkerControlMPI(Communicate* c) : WalkerControlBase(c) { @@ -95,6 +103,7 @@ int WalkerControlMPI::branch(int iter, MCWalkerConfiguration& W, FullPrecRealTyp for (int i = 0, j = LE_MAX; i < num_contexts_; i++, j++) NumPerNode[i] = static_cast(curData[j]); int current_population = std::accumulate(NumPerNode.begin(), NumPerNode.end(), 0); + Cur_pop = applyNmaxNmin(current_population); myTimers[DMC_MPI_prebalance]->stop(); myTimers[DMC_MPI_loadbalance]->start(); @@ -139,39 +148,51 @@ int WalkerControlMPI::branch(int iter, MCPopulation& pop, FullPrecRealType trigg myTimers[DMC_MPI_branch]->start(); myTimers[DMC_MPI_prebalance]->start(); std::fill(curData.begin(), curData.end(), 0); + // This has the same ridiculous side effect as SortWalkers + // i.e. it updates most of curData PopulationAdjustment adjust(calcPopulationAdjustment(pop)); //use NumWalkersSent from the previous exchange + //You need another copy because curData is zeroed out defensively. curData[SENTWALKERS_INDEX] = NumWalkersSent; - //update the number of walkers for this node - curData[LE_MAX + MyContext] = adjust.num_walkers; + + //This should not be used by the new driver code + curData[LE_MAX + MyContext] = -1000; myTimers[DMC_MPI_allreduce]->start(); + // You might think we are just reducing LE and sent walkers but + // see calcPopulationAdjustments massive side effects. myComm->allreduce(curData); myTimers[DMC_MPI_allreduce]->stop(); measureProperties(iter); pop.set_ensemble_property(ensemble_property_); - for (int i = 0, j = LE_MAX; i < num_contexts_; i++, j++) - NumPerNode[i] = static_cast(curData[j]); - Cur_pop = applyNmaxNmin(pop.get_num_global_walkers()); + + //All of this should really just accomplish what onRankSpawnKill does for a nonmpi job. + adjustPopulation(pop, adjust); + + auto num_per_node = WalkerControlBase::syncFutureWalkersPerRank(this->getCommunicator(), adjust.num_walkers); + myTimers[DMC_MPI_prebalance]->stop(); myTimers[DMC_MPI_loadbalance]->start(); - swapWalkersSimple(pop, adjust); + swapWalkersSimple(pop, adjust, num_per_node); myTimers[DMC_MPI_loadbalance]->stop(); - // myTimers[DMC_MPI_copyWalkers]->start(); - // copyWalkers(pop); - // myTimers[DMC_MPI_copyWalkers]->stop(); - // //set Weight and Multiplicity to default values - // for (UPtr& walker : pop.get_walkers()) - // { - // walker->Weight = 1.0; - // walker->Multiplicity = 1.0; - // } + + onRankSpawnKill(pop, adjust); + + for (UPtr& walker : pop.get_walkers()) + { + walker->Weight = 1.0; + walker->Multiplicity = 1.0; + } // //update the global number of walkers and offsets - // pop.set_num_global_walkers(Cur_pop); - // pop.set_walker_offsets(FairOffSet); - // myTimers[DMC_MPI_branch]->stop(); - return Cur_pop; + myComm->allreduce(curData); + + // Discover the population now + pop.syncWalkersPerNode(getCommunicator()); + + myTimers[DMC_MPI_branch]->stop(); + + return pop.get_num_global_walkers(); } // determine new walker population on each node @@ -449,22 +470,23 @@ void WalkerControlMPI::swapWalkersSimple(MCWalkerConfiguration& W) } /** swap Walkers between rank MCPopulations - * - * 2005 called and asked for its synchronous messaging back - * I think MPI async is more than two years old * * MCPopulation is sufficiently different from MCWalkerConfiguration that this is - * basically a rewrite. I have not replicated some elements of the original. - * For example: - * * Sending duplicates to the same reciever rank but resorting good walkers if send-receive pair - * changes. - * * Sending a tiny separate blocking message to tell the receiver how many copies + * basically a rewrite. + * \param[inout] pop + * \param[inout] adjust + * + * This method should not be dependent on legacy state variables of + * Cur_pop, NumPernode * */ -void WalkerControlMPI::swapWalkersSimple(MCPopulation& pop, PopulationAdjustment& adjust) +void WalkerControlMPI::swapWalkersSimple(MCPopulation& pop, + PopulationAdjustment& adjust, + std::vector num_per_node) { + int expanded_population = std::accumulate(num_per_node.begin(), num_per_node.end(), 0); std::vector minus, plus; - determineNewWalkerPopulation(Cur_pop, num_contexts_, MyContext, NumPerNode, FairOffSet, minus, plus); + determineNewWalkerPopulation(expanded_population, num_contexts_, MyContext, num_per_node, FairOffSet, plus, minus); if (adjust.good_walkers.empty() && adjust.bad_walkers.empty()) { @@ -474,6 +496,10 @@ void WalkerControlMPI::swapWalkersSimple(MCPopulation& pop, PopulationAdjustment APP_ABORT("WalkerControlMPI::swapWalkersSimple no existing walker"); } + // Looks like a just in case update should be justified. + for (MCPWalker& walker : adjust.good_walkers) + walker.updateBuffer(); + if (plus.size() != minus.size()) { app_error() << "Walker send/recv pattern doesn't match. " @@ -483,9 +509,10 @@ void WalkerControlMPI::swapWalkersSimple(MCPopulation& pop, PopulationAdjustment } // sort good walkers by the number of copies - std::vector>> sorted_good_walkers; + std::vector> sorted_good_walkers; for (int iw = 0; iw < adjust.copies_to_make.size(); iw++) - sorted_good_walkers.push_back(std::make_pair(adjust.copies_to_make[iw], adjust.good_walkers[iw])); + sorted_good_walkers.push_back(std::make_pair(std::ref(adjust.copies_to_make[iw]), adjust.good_walkers[iw])); + // Sort only on the number of copies std::sort(sorted_good_walkers.begin(), sorted_good_walkers.end(), [](auto& a, auto& b) { return a.first > b.first; }); @@ -496,9 +523,9 @@ void WalkerControlMPI::swapWalkersSimple(MCPopulation& pop, PopulationAdjustment for (int ic = 0; ic < nswap; ic++) { - if (plus[ic] == MyContext) + if (minus[ic] == MyContext) ++local_sends; - else if (minus[ic] == MyContext) + else if (plus[ic] == MyContext) ++local_recvs; } @@ -509,118 +536,112 @@ void WalkerControlMPI::swapWalkersSimple(MCPopulation& pop, PopulationAdjustment recv_message_list.reserve(local_recvs); RefVector new_walkers; new_walkers.reserve(local_recvs); - + // Their data needs to not get written over until we are done. + RefVector zombies; for (int ic = 0; ic < nswap; ic++) { - if (plus[ic] == MyContext) + if (minus[ic] == MyContext) { - // always send the last good walker - send_message_list.push_back(WalkerMessage{sorted_good_walkers.back().second, plus[ic], minus[ic]}); - --(sorted_good_walkers.back().first); - if (sorted_good_walkers.back().first == 0) - sorted_good_walkers.pop_back(); + // always send the last good walker or if we're out send the first zombie + if (!sorted_good_walkers.empty()) + { + send_message_list.push_back(WalkerMessage{sorted_good_walkers.back().second, minus[ic], plus[ic]}); + --(sorted_good_walkers.back().first); + if (sorted_good_walkers.back().first < 0) + { + // Danger possible race condition if this dead walker ends up back in the pool + // so temporary refvector for those to be killed. + zombies.push_back(sorted_good_walkers.back().second); + sorted_good_walkers.pop_back(); + } + } + else + { + send_message_list.push_back(WalkerMessage{zombies.front(), minus[ic], plus[ic]}); + app_warning() << "Rank " << myComm->rank() << "Had to send best zombie for population control.\n"; + } } - else if (minus[ic] == MyContext) + else if (plus[ic] == MyContext) { if (adjust.bad_walkers.size() > 0) { pop.killWalker(adjust.bad_walkers.back()); adjust.bad_walkers.pop_back(); } - new_walkers.push_back(pop.spawnWalker()); - recv_message_list.push_back(WalkerMessage{new_walkers.back(), plus[ic], minus[ic]}); + MCPWalker& spawned_walker = *(pop.spawnWalker()); + new_walkers.push_back(spawned_walker); + + recv_message_list.push_back(WalkerMessage{new_walkers.back(), minus[ic], plus[ic]}); } } //create send requests std::vector send_requests; + if (send_message_list.size() > 0) { - std::queue send_message_queue; - auto it_send_messages = send_message_list.begin(); - // Doing depublication of messages - while (it_send_messages != send_message_list.end()) - { - if ((it_send_messages + 1 != send_message_list.end()) && (*(it_send_messages + 1) == *it_send_messages)) - (it_send_messages + 1)->multiplicity++; - else - send_message_queue.push(std::move(*it_send_messages)); - ++it_send_messages; - } - while (!send_message_queue.empty()) - { - WalkerMessage message = send_message_queue.front(); - send_message_queue.pop(); - size_t byteSize = message.walker[0].get().byteSize(); - send_requests.emplace_back( - myComm->comm.isend_n(message.walker[0].get().DataSet.data(), byteSize, message.target_rank)); - } + std::for_each(send_message_list.begin(), send_message_list.end(), [&send_requests, this](WalkerMessage& message) { + MCPWalker& this_walker = message.walker; + std::cout << "send buffer size: " << message.walker.DataSet.size() << '\n'; + send_requests.emplace_back(myComm->comm.isend_n(message.walker.DataSet.data(), + message.walker.DataSet.size(), message.target_rank)); + }); } //create recv requests std::vector recv_requests; - std::vector recv_messages_reduced; if (recv_message_list.size() > 0) { - auto it_recv_messages = recv_message_list.begin(); - // Doing depublication of messages - while (it_recv_messages != recv_message_list.end()) - { - if ((it_recv_messages + 1 != recv_message_list.end()) && (*(it_recv_messages + 1) == *it_recv_messages)) - { - (it_recv_messages + 1)->multiplicity++; - recv_messages_reduced.back().walker.push_back(std::move((*it_recv_messages).walker[0])); - } - else - recv_messages_reduced.push_back(std::move(*it_recv_messages)); - ++it_recv_messages; - } - std::for_each(recv_messages_reduced.begin(), recv_messages_reduced.end(), - [&recv_requests, this](WalkerMessage& message) { - recv_requests.emplace_back(myComm->comm.ireceive_n(message.walker[0].get().DataSet.data(), - message.byteSize, message.source_rank)); - }); + std::for_each(recv_message_list.begin(), recv_message_list.end(), [&recv_requests, this](WalkerMessage& message) { + MCPWalker& walker = message.walker; + std::cout << "recv buffer size: " << message.walker.DataSet.size() << '\n'; + recv_requests.emplace_back(myComm->comm.ireceive_n(message.walker.DataSet.data(), + message.walker.DataSet.size(), + message.source_rank)); + }); } - // Busy until all messages received - std::vector done_with_message(recv_requests.size(), 0); - // while (std::any_of(done_with_message.begin(), done_with_message.end(), [](int i) { return i == 0; })) - // { - if (local_recvs > 0) { + std::cout << "rank: " << MyContext << " local_recv: " << local_recvs << " local_sends: " << local_sends << '\n'; + std::cout << "rank: " << MyContext << " recv_message_list.size(): " << recv_message_list.size() + << " send_message_list.size(): " << send_message_list.size() << '\n'; + + if (local_recvs > 0) + { for (int im = 0; im < recv_requests.size(); ++im) { - recv_requests[im].start(); recv_requests[im].wait(); - // if (done_with_message[im] != 1) - // { - // if (recv_requests[im].completed()) - // { - recv_messages_reduced[im].walker[0].get().copyFromBuffer(); - for (int iw = 1; iw < recv_messages_reduced[im].multiplicity; ++iw) - { - std::memcpy(recv_messages_reduced[im].walker[iw].get().DataSet.data(), - recv_messages_reduced[im].walker[0].get().DataSet.data(), recv_messages_reduced[im].byteSize); - recv_messages_reduced[im].walker[iw].get().copyFromBuffer(); - } - // done_with_message[im] = 1; - //} + MCPWalker& walker_to_check = recv_message_list[im].walker; + recv_message_list[im].walker.copyFromBuffer(); + for (auto property : walker_to_check.Properties) + { + if (std::isnan(property)) + throw std::runtime_error("received property is nan!"); + } } } -// } - if (local_sends > 0) { - // After we've got all our receives wait if we're not done sending. - for (int im = 0; im < send_requests.size(); im++) + + if (local_sends > 0) { + std::vector send_completed(local_sends, 0); + // After we've got all our receives wait if we're not done sending. myTimers[DMC_MPI_send]->start(); - send_requests[im].start(); - send_requests[im].wait(); + while(std::any_of(send_completed.begin(), send_completed.end(), [](int i){ return i == 0; })) + { + for (int im = 0; im < send_requests.size(); im++) + { + send_requests[im].wait(); + if( ! send_completed[im] && send_requests[im].completed() ) + send_completed[im] = 1; + } + } myTimers[DMC_MPI_send]->stop(); } - } - + std::for_each(zombies.begin(), zombies.end(), [&pop](MCPWalker& zombie) { pop.killWalker(zombie); }); + + adjust.num_walkers = pop.get_num_local_walkers(); + NumWalkersSent = local_sends; - } diff --git a/src/QMCDrivers/DMC/WalkerControlMPI.h b/src/QMCDrivers/DMC/WalkerControlMPI.h index 8df78a5195..be7e554281 100644 --- a/src/QMCDrivers/DMC/WalkerControlMPI.h +++ b/src/QMCDrivers/DMC/WalkerControlMPI.h @@ -26,19 +26,17 @@ struct WalkerControlMPITest; namespace testing { -struct UnifiedDriverWalkerControlMPITest; +class UnifiedDriverWalkerControlMPITest; } struct WalkerMessage { // To deal with duplicate send optimization - RefVector walker; + WalkerControlBase::MCPWalker& walker; // i.e. MPI rank const int source_rank; const int target_rank; - int multiplicity = 1; - size_t byteSize = 0; - WalkerMessage(WalkerControlBase::MCPWalker& walk, const int source, const int target) : source_rank(source), target_rank(target){ walker.push_back(walk); }; + WalkerMessage(WalkerControlBase::MCPWalker& walk, const int source, const int target) : walker(walk), source_rank(source), target_rank(target) {} }; inline bool operator==(const WalkerMessage& A, const WalkerMessage& B) @@ -60,18 +58,30 @@ struct WalkerControlMPI : public WalkerControlBase int Cur_min; TimerList_t myTimers; ///Number of walkers sent during the exchange + // Is this persistent state for any reason other than we keep zeroing curData + // defensively? IndexType NumWalkersSent; /** default constructor * * Set the SwapMode to zero so that instantiation can be done + * comm can not be null it is not checked. */ - WalkerControlMPI(Communicate* c = 0); + WalkerControlMPI(Communicate* comm); /** creates the distribution plan * - * the minus and plus vectors contain 1 copy of a partition index for each adjustment - * in population to the context. + * populates the minus and plus vectors they contain 1 copy of a partition index + * for each adjustment in population to the context. + * \todo fix this word snalad + * + * \param[in] Cur_pop population taking multiplicity into account + * \param[in] NumContexts number of MPI processes + * \param[in] MyContext my MPI rank + * \param[in] NumPerNode as if all walkers were copied out to multiplicity + * \param[out] FairOffSet running population count at each partition boundary + * \param[out] minus list of partition indexes one occurance for each walker removed + * \param[out] plus list of partition indexes one occurance for each walker added */ static void determineNewWalkerPopulation(int Cur_pop, int NumContexts, @@ -90,7 +100,13 @@ struct WalkerControlMPI : public WalkerControlBase //current implementations void swapWalkersSimple(MCWalkerConfiguration& W); - void swapWalkersSimple(MCPopulation& pop, PopulationAdjustment& adjust); + /** Unified Driver Implementation + * + * \param[inout] local population + * \param[inout] adjust population adjustment, it's updated for so on node adjustments can occur + * \param[in] number of walkers per node if expanded by multiplicity. + */ + void swapWalkersSimple(MCPopulation& pop, PopulationAdjustment& adjust, std::vector num_per_node); // Testing wrappers friend WalkerControlMPITest; diff --git a/src/QMCDrivers/MCPopulation.cpp b/src/QMCDrivers/MCPopulation.cpp index 942469f090..1746e08f2b 100644 --- a/src/QMCDrivers/MCPopulation.cpp +++ b/src/QMCDrivers/MCPopulation.cpp @@ -19,7 +19,7 @@ namespace qmcplusplus { -MCPopulation::MCPopulation() : trial_wf_(nullptr), elec_particle_set_(nullptr), hamiltonian_(nullptr) {} +MCPopulation::MCPopulation() : trial_wf_(nullptr), elec_particle_set_(nullptr), hamiltonian_(nullptr), num_ranks_(1), rank_(0) {} MCPopulation::MCPopulation(int num_ranks, MCWalkerConfiguration& mcwc, @@ -27,7 +27,7 @@ MCPopulation::MCPopulation(int num_ranks, TrialWaveFunction* trial_wf, QMCHamiltonian* hamiltonian, int this_rank) - : num_ranks_(num_ranks), num_local_walkers_per_node_(num_ranks, 0), trial_wf_(trial_wf), elec_particle_set_(elecs), hamiltonian_(hamiltonian), rank_(this_rank) + : num_local_walkers_per_node_(num_ranks, 0), trial_wf_(trial_wf), elec_particle_set_(elecs), hamiltonian_(hamiltonian), num_ranks_(num_ranks), rank_(this_rank) { walker_offsets_ = mcwc.WalkerOffsets; num_global_walkers_ = mcwc.GlobalNumWalkers; @@ -57,16 +57,16 @@ MCPopulation::MCPopulation(int num_ranks, } MCPopulation::MCPopulation(int num_ranks, - ParticleSet* elecs, - TrialWaveFunction* trial_wf, - QMCHamiltonian* hamiltonian, - int this_rank) - : num_ranks_(num_ranks), - num_particles_(elecs->R.size()), + ParticleSet* elecs, + TrialWaveFunction* trial_wf, + QMCHamiltonian* hamiltonian, + int this_rank) + : num_particles_(elecs->R.size()), num_local_walkers_per_node_(num_ranks, 0), trial_wf_(trial_wf), elec_particle_set_(elecs), hamiltonian_(hamiltonian), + num_ranks_(num_ranks), rank_(this_rank) {} @@ -75,15 +75,6 @@ MCPopulation::MCPopulation(int num_ranks, */ void MCPopulation::createWalkers() { createWalkers(num_local_walkers_); } -void MCPopulation::createWalkerInplace(UPtr& walker_ptr) -{ - //SO this would be where the walker reuse hack would go - walker_ptr = std::make_unique(num_particles_); - walker_ptr->R = elec_particle_set_->R; - walker_ptr->registerData(); - walker_ptr->Properties = elec_particle_set_->Properties; -} - /** we could also search for walker_ptr */ void MCPopulation::allocateWalkerStuffInplace(int walker_index) @@ -93,24 +84,28 @@ void MCPopulation::allocateWalkerStuffInplace(int walker_index) walkers_[walker_index]->DataSet.allocate(); } -/** Creates walkers with starting positions pos and a clone of the electron particle set and trial wavefunction - * - * Needed - * in: DataSet.size() - * in. - */ void MCPopulation::createWalkers(IndexType num_walkers) { num_local_walkers_ = num_walkers; // Ye: need to resize walker_t and ParticleSet Properties + // Really MCPopulation does not own this elec_particle_set_ seems like it should be immutable elec_particle_set_->Properties.resize(1, elec_particle_set_->PropertyList.size()); + // This pattern is begging for a micro benchmark, is this really better + // than the simpler walkers_.pushback; walkers_.resize(num_walkers); - + auto createWalker = [this](UPtr& walker_ptr) { + walker_ptr = std::make_unique(num_particles_); + walker_ptr->R = elec_particle_set_->R; + // Side effect of this changes size of walker_ptr->Properties if done after registerData() you end up with + // a bad buffer. + walker_ptr->Properties = elec_particle_set_->Properties; + walker_ptr->registerData(); + + }; + for (auto& walker_ptr : walkers_) - { - createWalkerInplace(walker_ptr); - } + createWalker(walker_ptr); int num_walkers_created = 0; for (auto& walker_ptr : walkers_) @@ -168,7 +163,7 @@ void MCPopulation::createWalkers(IndexType num_walkers) * It would be better if this could be done just by * reusing memory. */ -MCPopulation::MCPWalker& MCPopulation::spawnWalker() +MCPopulation::MCPWalker* MCPopulation::spawnWalker() { ++num_local_walkers_; outputManager.pause(); @@ -187,20 +182,31 @@ MCPopulation::MCPWalker& MCPopulation::spawnWalker() { walkers_.push_back(std::move(dead_walkers_.back())); dead_walkers_.pop_back(); + //This will only work if there has been no change in the data structuring of Walker since createWalkers + walkers_.back()->DataSet.clear(); + walkers_.back()->registerData(); makeDependentObjects(); + //Here we assume no allocate is necessary since there should have been no changes in the other walker + //elements since createWalkers, yet it must be called + walkers_.back()->DataSet.allocate(); + walkers_.back()->Multiplicity = 1.0; + walkers_.back()->Weight = 1.0; } else { walkers_.push_back(std::make_unique(num_particles_)); walkers_.back()->R = elec_particle_set_->R; - walkers_.back()->registerData(); walkers_.back()->Properties = elec_particle_set_->Properties; + walkers_.back()->DataSet.clear(); + walkers_.back()->registerData(); makeDependentObjects(); walkers_.back()->DataSet.allocate(); + walkers_.back()->Multiplicity = 1.0; + walkers_.back()->Weight = 1.0; } outputManager.resume(); - return *(walkers_.back()); + return walkers_.back().get(); } /** Kill last walker @@ -219,7 +225,6 @@ void MCPopulation::killLastWalker() */ void MCPopulation::killWalker(MCPWalker& walker) { - --num_local_walkers_; // find the walker and null its pointer in the walker vector auto it_walkers = walkers_.begin(); auto it_psets = walker_elec_particle_sets_.begin(); @@ -235,6 +240,7 @@ void MCPopulation::killWalker(MCPWalker& walker) walker_elec_particle_sets_.erase(it_psets); walker_trial_wavefunctions_.erase(it_twfs); walker_hamiltonians_.erase(it_hams); + --num_local_walkers_; return; } ++it_walkers; diff --git a/src/QMCDrivers/MCPopulation.h b/src/QMCDrivers/MCPopulation.h index 8551b4f2b4..3f21378426 100644 --- a/src/QMCDrivers/MCPopulation.h +++ b/src/QMCDrivers/MCPopulation.h @@ -24,11 +24,12 @@ #include "Particle/Walker.h" #include "OhmmsPETE/OhmmsVector.h" #include "QMCWaveFunctions/TrialWaveFunction.h" +#include "QMCHamiltonians/QMCHamiltonian.h" namespace qmcplusplus { -class QMCHamiltonian; + class MCPopulation { @@ -38,12 +39,12 @@ class MCPopulation using RealType = QMCTraits::RealType; using Properties = MCPWalker::PropertyContainer_t; using IndexType = QMCTraits::IndexType; - + using FullPrecRealType = QMCTraits::FullPrecRealType; + private: // Potential thread safety issue MCDataType ensemble_property_; - int num_ranks_ = 0; IndexType num_global_walkers_ = 0; IndexType num_local_walkers_ = 0; IndexType num_particles_ = 0; @@ -81,8 +82,10 @@ class MCPopulation UPtrVector walker_elec_particle_sets_; UPtrVector walker_trial_wavefunctions_; UPtrVector walker_hamiltonians_; - // This should perhaps just be aquired from comm but currently MCPopulation - // is innocent of comm, Every object needing a copy is suboptimal. + + // MCPopulation immutables + // would be nice if they were const but we'd lose the default move assignment + int num_ranks_; int rank_; public: @@ -95,25 +98,29 @@ class MCPopulation ParticleSet* elecs, TrialWaveFunction* trial_wf, QMCHamiltonian* hamiltonian_, - int this_rank = 0); + int this_rank); MCPopulation(int num_ranks, ParticleSet* elecs, TrialWaveFunction* trial_wf, QMCHamiltonian* hamiltonian, - int this_rank = 0); + int this_rank); - MCPopulation(MCPopulation&) = default; + MCPopulation(MCPopulation&) = delete; + MCPopulation& operator=(MCPopulation&) = delete; MCPopulation(MCPopulation&&) = default; MCPopulation& operator=(MCPopulation&&) = default; + ~MCPopulation() { + std::cout << "MCPopulation::~MCPopulation\n"; + } /** @ingroup PopulationControl * * State Requirement: * * createWalkers must have been called * @{ */ - MCPWalker& spawnWalker(); + MCPWalker* spawnWalker(); void killWalker(MCPWalker&); void killLastWalker(); void createWalkerInplace(UPtr& walker_ptr); @@ -121,6 +128,8 @@ class MCPopulation /** }@ */ void createWalkers(); + /** Creates walkers with a clone of the golden electron particle set and golden trial wavefunction + */ void createWalkers(IndexType num_walkers); void createWalkers(int num_crowds_, int num_walkers_per_crowd_, @@ -131,7 +140,7 @@ class MCPopulation /** puts walkers and their "cloned" things into groups in a somewhat general way * * Should compile only if ITER is a proper input ITERATOR - * Will crash if ITER does point to a std::unique_ptr + * Will crash if ITER does point to a std::unqiue_ptr * */ template> @@ -172,6 +181,7 @@ class MCPopulation */ //IndexType get_active_walkers() const { return walkers_.size(); } int get_num_ranks() const { return num_ranks_; } + int get_rank() const { return rank_; } IndexType get_num_global_walkers() const { return num_global_walkers_; } IndexType get_num_local_walkers() const { return num_local_walkers_; } IndexType get_num_particles() const { return num_particles_; } @@ -181,6 +191,7 @@ class MCPopulation //const Properties& get_properties() const { return properties_; } const SpeciesSet& get_species_set() const { return species_set_; } const ParticleSet& get_ions() const { return ions_; } + const ParticleSet* get_golden_electrons() const {return elec_particle_set_; } const std::vector& get_walker_offsets() const { return walker_offsets_; } std::vector get_num_local_walkers_per_node() const { return num_local_walkers_per_node_; } void syncWalkersPerNode(Communicate* comm); diff --git a/src/QMCDrivers/QMCDriverInput.cpp b/src/QMCDrivers/QMCDriverInput.cpp index 83024969b5..255154479b 100644 --- a/src/QMCDrivers/QMCDriverInput.cpp +++ b/src/QMCDrivers/QMCDriverInput.cpp @@ -64,7 +64,8 @@ void QMCDriverInput::readXML(xmlNodePtr cur) parameter_set.add(blocks_between_recompute_, "blocks_between_recompute", "int"); parameter_set.add(drift_modifier_, "drift_modifier", "string"); parameter_set.add(drift_modifier_unr_a_, "drift_UNR_a", "double"); - + parameter_set.add(max_disp_sq_, "maxDisplSq", "double"); + OhmmsAttributeSet aAttrib; // first stage in from QMCDriverFactory diff --git a/src/QMCDrivers/QMCDriverInput.h b/src/QMCDrivers/QMCDriverInput.h index 063db380db..413eb30f16 100644 --- a/src/QMCDrivers/QMCDriverInput.h +++ b/src/QMCDrivers/QMCDriverInput.h @@ -103,6 +103,9 @@ class QMCDriverInput IndexType k_delay_ = 0; bool reset_random_ = false; + // from QMCUpdateBase + RealType max_disp_sq_ = -1.0; + // for drift modifer std::string drift_modifier_{"UNR"}; RealType drift_modifier_unr_a_ = 1.0; @@ -121,6 +124,7 @@ class QMCDriverInput IndexType get_walkers_per_rank() const { return walkers_per_rank_; } IndexType get_requested_samples() const { return requested_samples_; } IndexType get_sub_steps() const { return sub_steps_; } + RealType get_max_disp_sq() const { return max_disp_sq_; } IndexType get_max_blocks() const { return max_blocks_; } IndexType get_max_steps() const { return max_steps_; } IndexType get_warmup_steps() const { return warmup_steps_; } diff --git a/src/QMCDrivers/QMCDriverNew.cpp b/src/QMCDrivers/QMCDriverNew.cpp index fe9d017618..1c1459e3be 100644 --- a/src/QMCDrivers/QMCDriverNew.cpp +++ b/src/QMCDrivers/QMCDriverNew.cpp @@ -65,6 +65,14 @@ QMCDriverNew::QMCDriverNew(QMCDriverInput&& input, num_crowds_ = input.get_num_crowds(); rotation = 0; + + // This needs to be done here to keep dependency on CrystalLattice out of the QMCDriverInput. + max_disp_sq_ = input.get_max_disp_sq(); + if(max_disp_sq_ < 0) + { + const CrystalLattice& lattice = population.get_golden_electrons()->Lattice; + max_disp_sq_ = lattice.LR_rc * lattice.LR_rc; + } } int QMCDriverNew::addObservable(const std::string& aname) diff --git a/src/QMCDrivers/QMCDriverNew.h b/src/QMCDrivers/QMCDriverNew.h index aebcb97cda..4064468116 100644 --- a/src/QMCDrivers/QMCDriverNew.h +++ b/src/QMCDrivers/QMCDriverNew.h @@ -322,7 +322,7 @@ class QMCDriverNew : public QMCDriverInterface, public MPIObjectBase * @{ */ int num_crowds_; - + RealType max_disp_sq_; ///the number of saved samples IndexType target_samples_; diff --git a/src/QMCDrivers/SimpleFixedNodeBranch.cpp b/src/QMCDrivers/SimpleFixedNodeBranch.cpp index 6a4f7702f6..c7f06ce36c 100644 --- a/src/QMCDrivers/SimpleFixedNodeBranch.cpp +++ b/src/QMCDrivers/SimpleFixedNodeBranch.cpp @@ -1,4 +1,4 @@ -////////////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////////////// // This file is distributed under the University of Illinois/NCSA Open Source License. // See LICENSE file in top directory for details. // @@ -26,6 +26,7 @@ #include "Estimators/EstimatorManagerBase.h" #include "QMCDrivers/BranchIO.h" #include "Particle/Reptile.h" +#include "type_traits/template_types.hpp" namespace qmcplusplus { @@ -149,6 +150,7 @@ int SimpleFixedNodeBranch::initWalkerController(MCWalkerConfiguration& walkers, walkers.setWalkerOffsets(nwoff); iParam[B_TARGETWALKERS] = nwoff[ncontexts]; } + // \todo reparsing XML nodes is an antipattern, remove. WalkerController.reset(createWalkerController(iParam[B_TARGETWALKERS], MyEstimator->getCommunicator(), myNode)); if (!BranchMode[B_RESTART]) { @@ -163,6 +165,7 @@ int SimpleFixedNodeBranch::initWalkerController(MCWalkerConfiguration& walkers, { app_log() << "Warmup DMC is done with a fixed population " << iParam[B_TARGETWALKERS] << std::endl; BackupWalkerController = std::move(WalkerController); //save the main controller + // \todo: Not likely to be ok to call reset on a moved from WalkerController! WalkerController.reset( createWalkerController(iParam[B_TARGETWALKERS], MyEstimator->getCommunicator(), myNode, true)); BranchMode.set(B_POPCONTROL, 0); @@ -265,12 +268,13 @@ int SimpleFixedNodeBranch::initWalkerController(MCPopulation& population, bool f //assign current Eref and a large number for variance WalkerController->setTrialEnergy(vParam[SBVP::ETRIAL]); this->reset(); + int allow_flux = 50; // std::min(static_cast(population.get_num_global_walkers() * 0.10), 50); if (fromscratch) { //determine the branch cutoff to limit wild weights based on the sigma and sigmaBound // Was check MCWC's particle set for number of R which I take to mean number of particles // will this assumption change if various spin freedoms also are added to ParticleSet? - setBranchCutoff(vParam[SBVP::SIGMA2], WalkerController->get_target_sigma(), 50, population.get_num_particles()); + setBranchCutoff(vParam[SBVP::SIGMA2], WalkerController->get_target_sigma(), allow_flux, population.get_num_particles()); vParam[SBVP::TAUEFF] = tau * R2Accepted.result() / R2Proposed.result(); } //reset controller @@ -483,10 +487,12 @@ void SimpleFixedNodeBranch::branch(int iter, UPtrVector& crowds, MCPopul vParam[SBVP::EREF] = EnergyHist.mean(); //current mean if (BranchMode[B_USETAUEFF]) vParam[SBVP::TAUEFF] = vParam[SBVP::TAU] * R2Accepted.result() / R2Proposed.result(); + if (BranchMode[B_KILLNODES]) EnergyHist(vParam[SBVP::ENOW] - std::log(wc_ensemble_prop.LivingFraction) / vParam[SBVP::TAUEFF]); else EnergyHist(vParam[SBVP::ENOW]); + if (BranchMode[B_DMCSTAGE]) // main stage { if (BranchMode[B_POPCONTROL]) diff --git a/src/QMCDrivers/SimpleFixedNodeBranch.h b/src/QMCDrivers/SimpleFixedNodeBranch.h index 761a251a9e..cf54641dc2 100644 --- a/src/QMCDrivers/SimpleFixedNodeBranch.h +++ b/src/QMCDrivers/SimpleFixedNodeBranch.h @@ -41,13 +41,14 @@ class EstimatorManagerBase; /** Manages the state of QMC sections and handles population control for DMCs * - * TODO: Remove Estimator dependency, only has come dependency. Express accumulate in + * \todo: Remove Estimator dependency, only has come dependency. Express accumulate in * the actual DMC algorithm (i.e. in DMCBatched.cpp) - * TODO: Remove duplicate reading of Driver XML section with own copies of input + * \todo: Remove duplicate reading of Driver XML section with own copies of input * parameters. - * TODO: Rename, it is the only branching class so its name is too much - * TODO: Use normal types for data members, don't be clever to be clever + * \todo: Rename, it is the only branching class so its name is too much + * \todo: Use normal types for data members, don't be clever, * the parameter enums violate KISS and make debugging annoying + * \todo: Remove as much state as possible. * * QMCDriver object owns a SimpleFixedNodeBranch to keep track of the * progress of a qmc section. It implements several methods to control the @@ -57,7 +58,44 @@ class EstimatorManagerBase; * manages the population (killing and duplicating walkers) and * load balancing among multiple MPI tasks. * \see {http://qmcpack.cmscc.org/qmc-basics} - */ + * + * Steps in 'Legacy' SFNB states machine + * 1. Construction (gets global walker number (rank or section wide?) + * 2. setEstimatorManager (also makes bootstrapping SFNB state dependent on valid Communicate*) + * 3. put(reads driver XML node yet again) + * 4. setWalkerController (Maybe a WalkerController pointer is passed in) + * 5. InitWalkerController + * a. Creates walkercontroller if WalkerController is a nullptr + * b. If TargetWalkers isn't known + * aa. allreduce and updates MCMW globalWalkers. + * bb. resets walker offsets + * cc. sets target walkers to whatever current total active walkers is. + * c. resets WalkerController + * d. If not a restart + * aa. saves fixW and killWalker to internal params, otherwise just discards. + * bb. updates SFNB copy of MAX/MINWALKRS from walker controller, + * these were set in constructer but I guess thats ony if this is a restart + * e. setWalkerId + * aa. call start() + * 1. Which calls reset which crucially calculates and update logN state. + * bb. updates all the walker id's of walkers in MCWC. + * 6. checkParameters + * a. getCurrentStatistics from SFNB's estimator + * b. set ETrial, EREF, SIGMA2 from estimator + * c. clear EnergyHist and VarianceHist + * + * Finally branch can be called! It will be called once each step. + * + * 7. call branch (iter and MCMW) + * a. Not first iter during warmup then call WalkerController branch. + * else call WC doNotBranch, returns pop_now + * b. copy a bunch a state from WC to SFNB (should be with respect to pop_now - number of released nodes) + * c. If using taueff update that based on acceptance ration and current tau. + * d. If not warmup calculate ETRIAL based on EREF and feedback * log(TargetWalkers) - log(pop_now) + * e. set WC's TrialEnergy + * d. multiply walkers.Colelctables *= the inverse weight. + * f. call SFNB's estimator accumilator on MCWC + */ struct SimpleFixedNodeBranch : public QMCTraits { typedef SimpleFixedNodeBranch ThisType; diff --git a/src/QMCDrivers/WalkerControlBase.cpp b/src/QMCDrivers/WalkerControlBase.cpp index f7e57fe930..fc9d800376 100644 --- a/src/QMCDrivers/WalkerControlBase.cpp +++ b/src/QMCDrivers/WalkerControlBase.cpp @@ -228,8 +228,8 @@ int WalkerControlBase::doNotBranch(int iter, MCWalkerConfiguration& W) curData[EREF_INDEX] = ecum; curData[R2ACCEPTED_INDEX] = r2_accepted; curData[R2PROPOSED_INDEX] = r2_proposed; - curData[FNSIZE_INDEX] = static_cast(nfn); - curData[RNONESIZE_INDEX] = static_cast(ncr); + curData[FNSIZE_INDEX] = nfn; + curData[RNONESIZE_INDEX] = ncr; curData[RNSIZE_INDEX] = nrn; curData[B_ENERGY_INDEX] = besum; curData[B_WGT_INDEX] = bwgtsum; @@ -304,12 +304,18 @@ int WalkerControlBase::doNotBranch(int iter, MCPopulation& pop) curData[EREF_INDEX] = ecum; curData[R2ACCEPTED_INDEX] = r2_accepted; curData[R2PROPOSED_INDEX] = r2_proposed; - curData[FNSIZE_INDEX] = static_cast(nfn); - curData[RNONESIZE_INDEX] = static_cast(ncr); + curData[FNSIZE_INDEX] = nfn; + curData[RNONESIZE_INDEX] = ncr; curData[RNSIZE_INDEX] = nrn; curData[B_ENERGY_INDEX] = besum; curData[B_WGT_INDEX] = bwgtsum; + // SENTWALKERS and LEMAX should never be other than 0 here + // because in the unified driver we never reuse a WalkerControl + assert( curData[SENTWALKERS_INDEX] == 0.0 ); + for (int i = LE_MAX; i < LE_MAX + num_contexts_; ++i) + assert( curData[i] == 0.0 ); + myComm->allreduce(curData); measureProperties(iter); trialEnergy = ensemble_property_.Energy; @@ -352,40 +358,47 @@ int WalkerControlBase::branch(int iter, MCWalkerConfiguration& W, FullPrecRealTy return nw_tot; } -int WalkerControlBase::branch(int iter, MCPopulation& pop, FullPrecRealType trigger) +void WalkerControlBase::onRankSpawnKill(MCPopulation& pop, PopulationAdjustment& adjust) { - // For measuring properties sortWalkers had important sideeffects - PopulationAdjustment adjust(calcPopulationAdjustment(pop)); - measureProperties(iter); - pop.set_ensemble_property(ensemble_property_); - - // Warning adjustPopulation has many side effects - adjustPopulation(pop, adjust); - // We have not yet updated the local number of walkers - // This happens as a side effect of killing or spawning walkers while (!adjust.bad_walkers.empty()) { - pop.killWalker(adjust.bad_walkers.back()); + //pop.killWalker(adjust.bad_walkers.back()); adjust.bad_walkers.pop_back(); } for (int iw = 0; iw < adjust.good_walkers.size(); ++iw) { for (int i_copies = 0; i_copies < adjust.copies_to_make[iw]; ++i_copies) { - auto& walker = pop.spawnWalker(); - walker = adjust.good_walkers[iw]; + MCPWalker* walker = pop.spawnWalker(); + *walker = adjust.good_walkers[iw]; // IF these are really unique ID's they should be UUID's or something // old algorithm seems to reuse them in a way that I'm not sure avoids // duplicates even at a particular time. - walker.ID = pop.get_num_local_walkers() * num_contexts_ + MyContext; - walker.ParentID = adjust.good_walkers[iw].get().ParentID; + walker->ID = pop.get_num_local_walkers() * pop.get_num_ranks() + pop.get_rank(); + walker->ParentID = adjust.good_walkers[iw].get().ParentID; } } +} - std::for_each(pop.get_walkers().begin(), pop.get_walkers().end(), [](auto& walker) { +int WalkerControlBase::branch(int iter, MCPopulation& pop, FullPrecRealType trigger) +{ + // For measuring properties sortWalkers had important side effects + PopulationAdjustment adjust(calcPopulationAdjustment(pop)); + measureProperties(iter); + pop.set_ensemble_property(ensemble_property_); + + // Warning adjustPopulation has many side effects + adjustPopulation(pop, adjust); + // We have not yet updated the local number of walkers + // This happens as a side effect of killing or spawning walkers + + WalkerControlBase::onRankSpawnKill(pop, adjust); + + for (UPtr& walker : pop.get_walkers()) + { walker->Weight = 1.0; walker->Multiplicity = 1.0; - }); + } pop.syncWalkersPerNode(getCommunicator()); @@ -555,38 +568,39 @@ int WalkerControlBase::sortWalkers(MCWalkerConfiguration& W) */ WalkerControlBase::PopulationAdjustment WalkerControlBase::calcPopulationAdjustment(MCPopulation& pop) { - RefVector walkers(convertUPtrToRefVector(pop.get_walkers())); + UPtrVector& walkers = pop.get_walkers(); // these are equivalent to the good_rn and ncopy_rn in the legacy code - RefVector good_walkers; - std::vector copies_to_make_good_walkers; - PopulationAdjustment adjustment; + + RefVector good_walkers_rn; + std::vector copies_to_make_rn; + adjustment.num_walkers = 0; // So many sums and counters FullPrecRealType esum = 0.0, e2sum = 0.0, wsum = 0.0, ecum = 0.0, w2sum = 0.0, besum = 0.0, bwgtsum = 0.0; FullPrecRealType r2_accepted = 0.0, r2_proposed = 0.0; int nfn(0), nrn(0), ngoodfn(0), ncr(0); - for (MCPWalker& walker : walkers) + for (UPtr& walker : walkers) { - bool inFN = (walker.ReleasedNodeAge == 0); + bool inFN = (walker->ReleasedNodeAge == 0); // Written as a lambda for emphasis, implicit conversion is identical to that of static_cast(...) - auto calcNumberWalkerCopies = [this](int multiplicity) -> int { return std::min(multiplicity, MaxCopy); }; - int nc = calcNumberWalkerCopies(walker.Multiplicity); + auto calcNumberWalkerCopies = [this](int multiplicity) -> int { return std::min(static_cast(multiplicity), MaxCopy); }; + int nc = calcNumberWalkerCopies(walker->Multiplicity); if (write_release_nodes_) { - if (walker.ReleasedNodeAge == 1) + if (walker->ReleasedNodeAge == 1) ncr += 1; - else if (walker.ReleasedNodeAge == 0) + else if (walker->ReleasedNodeAge == 0) { nfn += 1; ngoodfn += nc; } - r2_accepted += walker.Properties(WP::R2ACCEPTED); - r2_proposed += walker.Properties(WP::R2PROPOSED); - FullPrecRealType local_energy(walker.Properties(WP::LOCALENERGY)); - FullPrecRealType alternate_energy(walker.Properties(WP::ALTERNATEENERGY)); - FullPrecRealType wgt = walker.Weight; - FullPrecRealType rnwgt = walker.ReleasedNodeWeight; + r2_accepted += walker->Properties(WP::R2ACCEPTED); + r2_proposed += walker->Properties(WP::R2PROPOSED); + FullPrecRealType local_energy(walker->Properties(WP::LOCALENERGY)); + FullPrecRealType alternate_energy(walker->Properties(WP::ALTERNATEENERGY)); + FullPrecRealType wgt = walker->Weight; + FullPrecRealType rnwgt = walker->ReleasedNodeWeight; esum += wgt * rnwgt * local_energy; e2sum += wgt * rnwgt * local_energy * local_energy; wsum += rnwgt * wgt; @@ -601,10 +615,10 @@ WalkerControlBase::PopulationAdjustment WalkerControlBase::calcPopulationAdjustm nfn++; else ncr++; - r2_accepted += walker.Properties(WP::R2ACCEPTED); - r2_proposed += walker.Properties(WP::R2PROPOSED); - FullPrecRealType local_energy(walker.Properties(WP::LOCALENERGY)); - FullPrecRealType wgt = walker.Weight; + r2_accepted += walker->Properties(WP::R2ACCEPTED); + r2_proposed += walker->Properties(WP::R2PROPOSED); + FullPrecRealType local_energy(walker->Properties(WP::LOCALENERGY)); + FullPrecRealType wgt = walker->Weight; esum += wgt * local_energy; e2sum += wgt * local_energy * local_energy; wsum += wgt; @@ -615,7 +629,7 @@ WalkerControlBase::PopulationAdjustment WalkerControlBase::calcPopulationAdjustm if ((nc) && (inFN)) { adjustment.num_walkers += nc; - adjustment.good_walkers.push_back(walker); + adjustment.good_walkers.push_back(*walker); adjustment.copies_to_make.push_back(nc - 1); } else if (nc) @@ -624,20 +638,20 @@ WalkerControlBase::PopulationAdjustment WalkerControlBase::calcPopulationAdjustm // num_new_walkers != sum(copies_to_make) // qmcpack style would be to use this as an ersatz flag somewhere adjustment.num_walkers += nc; - nrn += nc; // Nothing is every done with this except put its size in - // curData[FNSIZE_INDEX] - good_walkers.push_back(walker); - copies_to_make_good_walkers.push_back(nc - 1); + // curData[FNSIZE_INDEX] which is later used + nrn += nc; + good_walkers_rn.push_back(*walker); + copies_to_make_rn.push_back(nc - 1); } else { - adjustment.bad_walkers.push_back(walker); + adjustment.bad_walkers.push_back(*walker); } } - //temp is an array to perform reduction operations - std::fill(curData.begin(), curData.end(), 0); - //update curData + + //update curData -- this is every field except for SENTWALKERS and LE_MAX (which is the beginning of the hack storing + // number_per_node entries curData[ENERGY_INDEX] = esum; curData[ENERGY_SQ_INDEX] = e2sum; curData[WALKERSIZE_INDEX] = pop.get_walkers().size(); @@ -660,8 +674,8 @@ WalkerControlBase::PopulationAdjustment WalkerControlBase::calcPopulationAdjustm adjustment.good_walkers.push_back(walker); adjustment.copies_to_make.push_back(copies); }; - for (int iw = 0; iw < good_walkers.size(); ++iw) - addWalkersWithReleaseNodeAge(good_walkers[iw], copies_to_make_good_walkers[iw]); + for (int iw = 0; iw < good_walkers_rn.size(); ++iw) + addWalkersWithReleaseNodeAge(good_walkers_rn[iw], copies_to_make_rn[iw]); } return adjustment; } @@ -791,10 +805,11 @@ std::vector WalkerControlBase::syncFutureWalkersPe */ int WalkerControlBase::adjustPopulation(MCPopulation& pop, PopulationAdjustment& adjust) { - // In the unified driver design each ranks MCPopulation knows this and it is not stored a bunch of other places, i.e. NumPerNode shouldn't be how we know. + // In the unified driver design each ranks MCPopulation knows this and it is not + // stored a bunch of other places, i.e. NumPerNode shouldn't be how we know. // Nothing should have touched the walker counts since we synced them at the top of branch - - // What we care about here are the populations we'll have after the adjusts are applied on each rank. + // What we care about here are the populations we'll have after the adjusts are + // applied on each rank. // This differs from the legacy implementation which had partially updated state at this point. auto num_per_node = WalkerControlBase::syncFutureWalkersPerRank(this->getCommunicator(), adjust.num_walkers); IndexType current_population = std::accumulate(num_per_node.begin(), num_per_node.end(), 0); @@ -804,7 +819,7 @@ int WalkerControlBase::adjustPopulation(MCPopulation& pop, PopulationAdjustment& // We assume the difference in number of walkers is no greater than 1 // our algorithm currently insures this. int current_max = (current_population + num_contexts_ - 1) / num_contexts_; - + if (current_max > n_max_) { app_warning() << "Exceeding Max Walkers per MPI rank : " << n_max_ << ". Ceiling is applied" << std::endl; @@ -867,9 +882,10 @@ int WalkerControlBase::adjustPopulation(MCPopulation& pop, PopulationAdjustment& // limit Nmin if (current_population / num_contexts_ < n_min_) { - app_warning() << "The number of walkers is running lower than Min Walkers per MPI rank : " << n_min_ - << ". Floor is applied" << std::endl; int nadd = n_min_ * num_contexts_ - current_population; + app_warning() << "The number of walkers " << (current_population / num_contexts_) + << " is running lower than Min Walkers per MPI rank : " << n_min_ + << ". Floor is applied, adding " << nadd << " walkers." << '\n'; for (int inode = 0; inode < num_contexts_; inode++) { if (num_per_node[inode] > 0 && num_per_node[inode] < n_min_) @@ -898,20 +914,37 @@ int WalkerControlBase::adjustPopulation(MCPopulation& pop, PopulationAdjustment& break; } } + while (nadd) + { + for (int inode = 0; inode < num_contexts_; inode++) + { + if (num_per_node[inode] < n_max_) + { + if (inode == MyContext) + for(int iw = 0; iw < adjust.copies_to_make.size(); iw++) + { + adjust.copies_to_make[iw] += 1; + } + --nadd; + } + } + } if (nadd) // Why is this a warning when the opposition situation is an abort? - app_warning() << "WalkerControlBase::applyNmaxNmin not able to add sufficient walkers overall!" << std::endl; + app_warning() << "WalkerControlBase::adjustPopulation not able to add sufficient walkers overall!" << std::endl; } // check current population current_population = std::accumulate(num_per_node.begin(), num_per_node.end(), 0); + adjust.num_walkers = std::accumulate(adjust.copies_to_make.begin(), adjust.copies_to_make.end(), 0) + adjust.copies_to_make.size(); + // at least one walker after load-balancing if (current_population / num_contexts_ == 0) { app_error() << "Some MPI ranks have no walkers after load balancing. This should not happen." << "Improve the trial wavefunction or adjust the simulation parameters." << std::endl; - APP_ABORT("WalkerControlBase::sortWalkers"); + APP_ABORT("WalkerControlBase::adjustPopulation"); } return current_population; diff --git a/src/QMCDrivers/WalkerControlBase.h b/src/QMCDrivers/WalkerControlBase.h index 4008018c9b..a91c694b19 100644 --- a/src/QMCDrivers/WalkerControlBase.h +++ b/src/QMCDrivers/WalkerControlBase.h @@ -28,6 +28,13 @@ namespace qmcplusplus { + +namespace testing +{ +class UnifiedDriverWalkerControlMPITest; +} + + /** Base class to control the walkers for DMC simulations. * * The virtual functions are implemented for a serial execution with a usual birth/death @@ -189,6 +196,13 @@ class WalkerControlBase : public MPIObjectBase void set_method(IndexType method) { method_ = method; } protected: + /** makes adjustments to local population based on adjust + * + * \param[inout] pop the local population + * \param[in] the population adjustment, it is not updated to reflect local state and is now invalid. + */ + static void onRankSpawnKill(MCPopulation& pop, PopulationAdjustment& adjust); + ///id for the method IndexType method_; ///minimum number of walkers @@ -224,7 +238,7 @@ class WalkerControlBase : public MPIObjectBase IndexType SwapMode; ///any accumulated data over a block std::vector accumData; - ///any temporary data includes many ridiculous conversions of intergrals to and from fp + ///any temporary data includes many ridiculous conversions of integral types to and from fp std::vector curData; ///temporary storage for good and bad walkers std::vector good_w, bad_w; @@ -238,6 +252,7 @@ class WalkerControlBase : public MPIObjectBase ///ensemble properties MCDataType ensemble_property_; + friend class qmcplusplus::testing::UnifiedDriverWalkerControlMPITest; }; } // namespace qmcplusplus diff --git a/src/QMCDrivers/tests/CMakeLists.txt b/src/QMCDrivers/tests/CMakeLists.txt index 16bf810d63..c00895aabd 100644 --- a/src/QMCDrivers/tests/CMakeLists.txt +++ b/src/QMCDrivers/tests/CMakeLists.txt @@ -2,9 +2,10 @@ #// This file is distributed under the University of Illinois/NCSA Open Source License. #// See LICENSE file in top directory for details. #// -#// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +#// Copyright (c) 2020 QMCPACK developers. #// -#// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory +#// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory +#// Ye Luo, yeluo@anl.gov, Argonne National Laboratory #// #// File created by: Mark Dewing, markdewing@gmail.com, University of Illinois at Urbana-Champaign #////////////////////////////////////////////////////////////////////////////////////// @@ -28,6 +29,8 @@ IF (NOT QMC_CUDA) SET(DRIVER_TEST_SRC ${DRIVER_TEST_SRC} test_vmc_driver.cpp test_dmc_driver.cpp) ENDIF() +# legacy driver mpi, not tested in a meaningful way since the standard unit test is +# just one rank with mpi. IF(HAVE_MPI) SET(DRIVER_TEST_SRC ${DRIVER_TEST_SRC} test_walker_control.cpp) ENDIF() @@ -41,31 +44,38 @@ SET_TESTS_PROPERTIES(${UTEST_NAME} PROPERTIES WORKING_DIRECTORY ${UTEST_DIR}) IF(NOT QMC_CUDA AND ENABLE_SOA) -# New Driver Only test program -# to avoid dealing with the build dependencies -# and memory leaks brought in by the old driver tests - -SET(UTEST_EXE test_new_${SRC_DIR}) -SET(UTEST_NAME deterministic-unit_test_new_${SRC_DIR}) - -# test_drivers shouldn't be dependent on tests/wavefunctions -SET(UTEST_DIR ${qmcpack_BINARY_DIR}/tests/drivers) -SET(UTEST_HDF_INPUT ${qmcpack_SOURCE_DIR}/tests/solids/diamondC_1x1x1_pp/pwscf.pwscf.h5) -EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -E make_directory "${UTEST_DIR}") -MAYBE_SYMLINK(${UTEST_HDF_INPUT} ${UTEST_DIR}/pwscf.pwscf.h5) - -SET(DRIVER_TEST_SRC SetupPools.cpp test_Crowd.cpp test_MCPopulation.cpp test_ContextForSteps.cpp test_QMCDriverInput.cpp test_QMCDriverNew.cpp test_VMCDriverInput.cpp test_VMCFactoryNew.cpp test_VMCBatched.cpp test_DMCBatched.cpp) - -IF(HAVE_MPI) - SET(DRIVER_TEST_SRC ${DRIVER_TEST_SRC} test_WalkerControlMPI.cpp) -ENDIF() - -ADD_EXECUTABLE(${UTEST_EXE} ${DRIVER_TEST_SRC}) - -USE_FAKE_RNG(${UTEST_EXE}) -TARGET_LINK_LIBRARIES(${UTEST_EXE} qmc qmcdriver_unit qmcham_unit qmcwfs qmcbase qmcutil qmcfakerng ${QMC_UTIL_LIBS} ${MPI_LIBRARY}) - - -ADD_UNIT_TEST(${UTEST_NAME} "${QMCPACK_UNIT_TEST_DIR}/${UTEST_EXE}") -SET_TESTS_PROPERTIES(${UTEST_NAME} PROPERTIES WORKING_DIRECTORY ${UTEST_DIR}) + # New Driver Only test program + # to avoid dealing with the build dependencies + # and memory leaks brought in by the old driver tests + + SET(UTEST_EXE test_new_${SRC_DIR}) + SET(UTEST_NAME deterministic-unit_test_new_${SRC_DIR}) + SET(UTEST_DIR ${qmcpack_BINARY_DIR}/tests/drivers) + SET(UTEST_HDF_INPUT ${qmcpack_SOURCE_DIR}/tests/solids/diamondC_1x1x1_pp/pwscf.pwscf.h5) + #this is dependent on the directory creation and sym linking of earlier driver tests + + SET(DRIVER_TEST_SRC SetupPools.cpp test_Crowd.cpp test_MCPopulation.cpp test_ContextForSteps.cpp test_QMCDriverInput.cpp test_QMCDriverNew.cpp test_VMCDriverInput.cpp test_VMCFactoryNew.cpp test_VMCBatched.cpp test_DMCBatched.cpp test_SimpleFixedNodeBranch.cpp) + ADD_EXECUTABLE(${UTEST_EXE} ${DRIVER_TEST_SRC}) + USE_FAKE_RNG(${UTEST_EXE}) + TARGET_LINK_LIBRARIES(${UTEST_EXE} qmc qmcdriver_unit qmcham_unit qmcwfs qmcbase qmcutil qmcfakerng ${QMC_UTIL_LIBS} ${MPI_LIBRARY}) + + ADD_UNIT_TEST(${UTEST_NAME} "${QMCPACK_UNIT_TEST_DIR}/${UTEST_EXE}") + SET_TESTS_PROPERTIES(${UTEST_NAME} PROPERTIES WORKING_DIRECTORY ${UTEST_DIR}) + + + IF(HAVE_MPI) + SET(UTEST_EXE test_${SRC_DIR}_mpi) + SET(UTEST_NAME deterministic-unit_test_${SRC_DIR}_mpi) + SET(UTEST_DIR ${qmcpack_BINARY_DIR}/tests/drivers) + SET(UTEST_HDF_INPUT ${qmcpack_SOURCE_DIR}/tests/solids/diamondC_1x1x1_pp/pwscf.pwscf.h5) + #this is dependent on the directory creation and sym linking of earlier driver tests + SET(DRIVER_TEST_SRC SetupPools.cpp test_WalkerControlMPI.cpp) + ADD_EXECUTABLE(${UTEST_EXE} ${DRIVER_TEST_SRC}) + USE_FAKE_RNG(${UTEST_EXE}) + #Way too many depenedencies make for very slow test linking + TARGET_LINK_LIBRARIES(${UTEST_EXE} qmc qmcdriver_unit qmcham_unit qmcwfs qmcbase qmcutil qmcfakerng ${QMC_UTIL_LIBS} ${MPI_LIBRARY}) + # Right now the unified driver mpi tests are hard coded for 3 MPI ranks + ADD_MPI_UNIT_TEST(${UTEST_NAME} "${QMCPACK_UNIT_TEST_DIR}/${UTEST_EXE}" 3) + SET_TESTS_PROPERTIES(${UTEST_NAME} PROPERTIES WORKING_DIRECTORY ${UTEST_DIR}) + ENDIF(HAVE_MPI) ENDIF(NOT QMC_CUDA AND ENABLE_SOA) diff --git a/src/QMCDrivers/tests/SetupDMCTest.h b/src/QMCDrivers/tests/SetupDMCTest.h index a97b828bfd..4accb12587 100644 --- a/src/QMCDrivers/tests/SetupDMCTest.h +++ b/src/QMCDrivers/tests/SetupDMCTest.h @@ -45,7 +45,7 @@ class SetupDMCTest : public SetupPools dmcdrv_input.readXML(node); population = MCPopulation(num_ranks, particle_pool->getParticleSet("e"), wavefunction_pool->getPrimary(), - hamiltonian_pool->getPrimary()); + hamiltonian_pool->getPrimary(),comm->rank()); QMCDriverInput qmc_input_copy(qmcdrv_input); DMCDriverInput dmc_input_copy(dmcdrv_input); diff --git a/src/QMCDrivers/tests/SetupPools.cpp b/src/QMCDrivers/tests/SetupPools.cpp index 2964f62deb..a3d420e109 100644 --- a/src/QMCDrivers/tests/SetupPools.cpp +++ b/src/QMCDrivers/tests/SetupPools.cpp @@ -20,7 +20,6 @@ namespace testing { SetupPools::SetupPools() { - OHMMS::Controller->initialize(0, NULL); comm = OHMMS::Controller; std::cout << "SetupPools::SetupPools()\n"; diff --git a/src/QMCDrivers/tests/test_DMCBatched.cpp b/src/QMCDrivers/tests/test_DMCBatched.cpp index b004a38ee8..da18a39ce7 100644 --- a/src/QMCDrivers/tests/test_DMCBatched.cpp +++ b/src/QMCDrivers/tests/test_DMCBatched.cpp @@ -30,7 +30,7 @@ TEST_CASE("DMCBatched::calc_default_local_walkers", "[drivers]") auto testWRTWalkersPerRank = [&dtest](int walkers_per_rank) { - DMCBatched dmc_batched(std::move(dtest())); + DMCBatched dmc_batched = dtest(); dmc_batched.set_walkers_per_rank(walkers_per_rank, "testing"); if (dtest.num_crowds < 8) dmc_batched.set_num_crowds(Concurrency::maxThreads(), "Insufficient threads available to match test input"); diff --git a/src/QMCDrivers/tests/test_MCPopulation.cpp b/src/QMCDrivers/tests/test_MCPopulation.cpp index c5830305e3..f2a3cbf21c 100644 --- a/src/QMCDrivers/tests/test_MCPopulation.cpp +++ b/src/QMCDrivers/tests/test_MCPopulation.cpp @@ -36,7 +36,7 @@ TEST_CASE("MCPopulation::createWalkers", "[particle][population]") HamiltonianPool hamiltonian_pool = mhp(comm, &particle_pool, &wavefunction_pool); TrialWaveFunction twf(comm); - MCPopulation population(1, particle_pool.getParticleSet("e"), &twf, hamiltonian_pool.getPrimary()); + MCPopulation population(1, particle_pool.getParticleSet("e"), &twf, hamiltonian_pool.getPrimary(),comm->rank()); population.createWalkers(8); REQUIRE(population.get_walkers().size() == 8); @@ -71,7 +71,7 @@ TEST_CASE("MCPopulation::distributeWalkers", "[particle][population]") HamiltonianPool hamiltonian_pool = mhp(comm, &particle_pool, &wavefunction_pool); MCPopulation population(1, particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), - hamiltonian_pool.getPrimary()); + hamiltonian_pool.getPrimary(), comm->rank()); population.createWalkers(24); REQUIRE(population.get_walkers().size() == 24); diff --git a/src/QMCDrivers/tests/test_QMCDriverNew.cpp b/src/QMCDrivers/tests/test_QMCDriverNew.cpp index 0c7812ec6d..a06e45c163 100644 --- a/src/QMCDrivers/tests/test_QMCDriverNew.cpp +++ b/src/QMCDrivers/tests/test_QMCDriverNew.cpp @@ -25,7 +25,6 @@ namespace qmcplusplus { - TEST_CASE("QMCDriverNew tiny case", "[drivers]") { using namespace testing; @@ -45,10 +44,11 @@ TEST_CASE("QMCDriverNew tiny case", "[drivers]") MinimalWaveFunctionPool wfp; WaveFunctionPool wavefunction_pool = wfp(comm, &particle_pool); wavefunction_pool.setPrimary(wavefunction_pool.getWaveFunction("psi0")); - + MinimalHamiltonianPool mhp; HamiltonianPool hamiltonian_pool = mhp(comm, &particle_pool, &wavefunction_pool); - MCPopulation population(1, particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), hamiltonian_pool.getPrimary()); + MCPopulation population(1, particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), + hamiltonian_pool.getPrimary(), comm->rank()); QMCDriverNewTestWrapper qmcdriver(std::move(qmcdriver_input), population, *(wavefunction_pool.getPrimary()), *(hamiltonian_pool.getPrimary()), wavefunction_pool, comm); @@ -89,10 +89,11 @@ TEST_CASE("QMCDriverNew integration", "[drivers]") MinimalWaveFunctionPool wfp; WaveFunctionPool wavefunction_pool = wfp(comm, &particle_pool); wavefunction_pool.setPrimary(wavefunction_pool.getWaveFunction("psi0")); - + MinimalHamiltonianPool mhp; HamiltonianPool hamiltonian_pool = mhp(comm, &particle_pool, &wavefunction_pool); - MCPopulation population(4, particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), hamiltonian_pool.getPrimary()); + MCPopulation population(4, particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), + hamiltonian_pool.getPrimary(), comm->rank()); QMCDriverNewTestWrapper qmcdriver(std::move(qmcdriver_input), population, *(wavefunction_pool.getPrimary()), *(hamiltonian_pool.getPrimary()), wavefunction_pool, comm); diff --git a/src/QMCDrivers/tests/test_SimpleFixedNodeBranch.cpp b/src/QMCDrivers/tests/test_SimpleFixedNodeBranch.cpp new file mode 100644 index 0000000000..a8eb69bf0f --- /dev/null +++ b/src/QMCDrivers/tests/test_SimpleFixedNodeBranch.cpp @@ -0,0 +1,164 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2019 QMCPACK developers. +// +// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory +// +// File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + +#include + +#include "Message/Communicate.h" + +#include "type_traits/template_types.hpp" +#include "Particle/MCWalkerConfiguration.h" +#include "Particle/Walker.h" +#include "Estimators/EstimatorManagerBase.h" +#include "Estimators/tests/FakeEstimator.h" +#include "QMCDrivers/SimpleFixedNodeBranch.h" +#include "QMCDrivers/MCPopulation.h" +#include "QMCDrivers/Crowd.h" +#include "QMCDrivers/tests/ValidQMCInputSections.h" +#include "QMCDrivers/tests/SetupPools.h" + +namespace qmcplusplus +{ +using MCPWalker = Walker; +using RealType = double; + +namespace testing +{ +class SetupSimpleFixedNodeBranch +{ +public: + SetupSimpleFixedNodeBranch(Communicate* comm) + { + comm_ = comm; + emb_ = std::make_unique(comm_); + FakeEstimator* fake_est = new FakeEstimator; + emb_->add(fake_est, "fake"); + + } + + SetupSimpleFixedNodeBranch() + { + OHMMS::Controller->initialize(0, NULL); + comm_ = OHMMS::Controller; + emb_ = std::make_unique(comm_); + FakeEstimator* fake_est = new FakeEstimator; + emb_->add(fake_est, "fake"); + } + + SimpleFixedNodeBranch operator()() + { + mcwc_ = std::make_unique(); + mcwc_->setName("electrons"); + + mcwc_->create(1); + mcwc_->R[0][0] = 0.0; + mcwc_->R[0][1] = 1.0; + mcwc_->R[0][2] = 2.0; + + MCPWalker w1(1); + w1.R[0] = 1.0; + MCPWalker w2(1); + w2.R[0] = 0.5; + + mcwc_->fakeWalkerList(&w1, &w2); + + SimpleFixedNodeBranch sfnb(tau_, num_global_walkers_); + + sfnb.setEstimatorManager(emb_.get()); + + createMyNode(sfnb, valid_dmc_input_sections[valid_dmc_input_dmc_index]); + + // WalkerController is created as a side effect. + sfnb.initWalkerController(*mcwc_, false, false); + + sfnb.checkParameters(*mcwc_); + + sfnb.branch(0, *mcwc_); + + return sfnb; + } + + SimpleFixedNodeBranch operator()(ParticleSet& p_set, TrialWaveFunction& twf, QMCHamiltonian& ham) + { + pop_ = std::make_unique(1, &p_set, &twf, &ham, comm_->rank()); + // MCPopulation owns it walkers it cannot just take refs so we just create and then update its walkers. + pop_->createWalkers(2); + + RefVector walkers = convertUPtrToRefVector(pop_->get_walkers()); + + walkers[0].get().R[0] = 1.0; + walkers[1].get().R[0] = 0.5; + + SimpleFixedNodeBranch sfnb(tau_, num_global_walkers_); + + sfnb.setEstimatorManager(emb_.get()); + + createMyNode(sfnb, valid_dmc_input_sections[valid_dmc_input_dmc_batch_index]); + + sfnb.initWalkerController(*pop_, false, false); + + + sfnb.checkParameters(pop_->get_num_global_walkers(), walkers); + + UPtrVector crowds; + crowds.emplace_back(std::make_unique(*emb_)); + crowds.emplace_back(std::make_unique(*emb_)); + + sfnb.branch(0, crowds, *pop_); + + return sfnb; + } + +private: + + void createMyNode(SimpleFixedNodeBranch& sfnb, const char* xml) + { + doc_ = std::make_unique(); + doc_->parseFromString(xml); + sfnb.myNode = doc_->getRoot(); + } + + Communicate* comm_; + UPtr emb_; + UPtr pop_; + UPtr doc_; + + RealType tau_ = 1.0; + int num_global_walkers_ = 16; + + //Legacy + UPtr mcwc_; +}; + +} + +TEST_CASE("SimpleFixedNodeBranch::branch(MCWC...)", "[drivers][legacy]") +{ + using namespace testing; + SetupSimpleFixedNodeBranch setup_sfnb; + SimpleFixedNodeBranch sfnb = setup_sfnb(); + + // \todo: check walker ID's here. might need more walkers to make this significant. + +} + +TEST_CASE("SimpleFixedNodeBranch::branch(MCPopulation...)", "[drivers]") +{ + using namespace testing; + SetupPools pools; + SetupSimpleFixedNodeBranch setup_sfnb(pools.comm); + SimpleFixedNodeBranch sfnb = setup_sfnb(*pools.particle_pool->getParticleSet("e"), + *pools.wavefunction_pool->getPrimary(), + *pools.hamiltonian_pool->getPrimary()); + +} + + +} // namespace qmcplusplus diff --git a/src/QMCDrivers/tests/test_VMCBatched.cpp b/src/QMCDrivers/tests/test_VMCBatched.cpp index c94fd9283d..1501af85d8 100644 --- a/src/QMCDrivers/tests/test_VMCBatched.cpp +++ b/src/QMCDrivers/tests/test_VMCBatched.cpp @@ -53,7 +53,7 @@ TEST_CASE("VMCBatched::calc_default_local_walkers", "[drivers]") auto testWRTWalkersPerRank = [&](int walkers_per_rank) { MCPopulation population(num_ranks, particle_pool.getParticleSet("e"), wavefunction_pool.getPrimary(), - hamiltonian_pool.getPrimary()); + hamiltonian_pool.getPrimary(),comm->rank()); QMCDriverInput qmcdriver_copy(qmcdriver_input); VMCDriverInput vmcdriver_input("yes"); VMCBatched vmc_batched(std::move(qmcdriver_copy), std::move(vmcdriver_input), population, diff --git a/src/QMCDrivers/tests/test_WalkerControlMPI.cpp b/src/QMCDrivers/tests/test_WalkerControlMPI.cpp index 731c3a379b..195d9e687f 100644 --- a/src/QMCDrivers/tests/test_WalkerControlMPI.cpp +++ b/src/QMCDrivers/tests/test_WalkerControlMPI.cpp @@ -9,164 +9,189 @@ // File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory ////////////////////////////////////////////////////////////////////////////////////// -#include +#include + +#include "Message/catch_mpi_main.hpp" + +//#include #include "Message/Communicate.h" +#include "QMCDrivers/MCPopulation.h" #include "QMCDrivers/QMCDriverInput.h" -#include "QMCDrivers/DMC/WalkerControlMPI.h" -#include "QMCDrivers/tests/ValidQMCInputSections.h" -#include "QMCDrivers/tests/SetupDMCTest.h" -#include "QMCApp/tests/MinimalParticlePool.h" -#include "QMCApp/tests/MinimalWaveFunctionPool.h" -#include "QMCApp/tests/MinimalHamiltonianPool.h" +#include "QMCDrivers/tests/test_WalkerControlMPI.h" +#include "Utilities/OutputManager.h" -#include "Concurrency/Info.hpp" -#include "Concurrency/UtilityFunctions.hpp" +//#include "Concurrency/Info.hpp" +//#include "Concurrency/UtilityFunctions.hpp" namespace qmcplusplus { namespace testing { -/** Once there is only one driver type rename - */ -struct UnifiedDriverWalkerControlMPITest +UnifiedDriverWalkerControlMPITest::UnifiedDriverWalkerControlMPITest() : wc_(dpools_.comm) { - void operator()() + using namespace testing; + + int num_ranks = dpools_.comm->size(); + pop_ = std::make_unique(num_ranks, dpools_.particle_pool->getParticleSet("e"), + dpools_.wavefunction_pool->getPrimary(), dpools_.hamiltonian_pool->getPrimary(), + dpools_.comm->rank()); + + pop_->createWalkers(1); + + wc_.use_nonblocking = true; + + + // Set up Cur_pop + wc_.Cur_pop = dpools_.comm->size(); + for (int i = 0; i < dpools_.comm->size(); i++) { - using namespace testing; - SetupDMCTest dtest; + wc_.NumPerNode[i] = 1; + } +} - WalkerControlMPI wc(dtest.comm); - // This finishes setup of population as a side effect. - DMCBatched dmc_batched(std::move(dtest())); +void UnifiedDriverWalkerControlMPITest::testMultiplicity(std::vector& rank_counts_expanded, + std::vector& rank_counts_after) +{ + using MCPWalker = MCPopulation::MCPWalker; - dtest.population.createWalkers(1); - - wc.use_nonblocking = true; + int rank = dpools_.comm->rank(); - // Set up Cur_pop - wc.Cur_pop = dtest.comm->size(); - for (int i = 0; i < dtest.comm->size(); i++) - { - wc.NumPerNode[i] = 1; - } + // Currently some/all this duplicate state is necessary to have a a successful swap. + // \todo remove duplicate state effecting population control - using MCPWalker = MCPopulation::MCPWalker; - // One walker on every node, should be no swapping - - WalkerControlBase::PopulationAdjustment pop_adjust{1, - convertUPtrToRefVector(dtest.population.get_walkers()), - {1}, - RefVector{}}; - wc.swapWalkersSimple(dtest.population, pop_adjust); + pop_->get_walkers()[0]->Multiplicity = rank_counts_expanded[rank]; + int future_pop = std::accumulate(rank_counts_expanded.begin(), rank_counts_expanded.end(), 0); + WalkerControlBase::PopulationAdjustment pop_adjust{future_pop, + convertUPtrToRefVector(pop_->get_walkers()), + {rank_counts_expanded[rank] - 1}, + RefVector{}}; - REQUIRE(dtest.population.get_num_local_walkers() == 1); + reportWalkersPerRank(dpools_.comm, *pop_); + wc_.swapWalkersSimple(*pop_, pop_adjust, rank_counts_expanded); + reportWalkersPerRank(dpools_.comm, *pop_); + CHECK(pop_->get_num_local_walkers() == rank_counts_after[rank]); +} - // add two walkers walkers on rank 0 - if (dtest.comm->rank() == 0) - { - // Use the ID variable to check that the walker content was transmitted - pop_adjust.good_walkers.push_back(dtest.population.spawnWalker()); - pop_adjust.good_walkers.back().get().ID = dtest.comm->size(); - pop_adjust.copies_to_make.push_back(0); - pop_adjust.good_walkers.push_back(dtest.population.spawnWalker()); - pop_adjust.good_walkers.back().get().ID = dtest.comm->size() + 1; - pop_adjust.copies_to_make.push_back(0); - } - wc.NumPerNode[0] = 3; - wc.Cur_pop += 2; +void UnifiedDriverWalkerControlMPITest::testPopulationDiff(std::vector& rank_counts_before, + std::vector& rank_counts_after) +{ + using MCPWalker = MCPopulation::MCPWalker; - reportWalkersPerRank(dtest.comm, dtest.population); - wc.swapWalkersSimple(dtest.population, pop_adjust); + int rank = dpools_.comm->rank(); - //std::cout << " Rank = " << c->rank() << " good size = " << wc.good_w.size() << - // " ID = " << wc.good_w[0]->ID << std::endl; + pop_->get_walkers()[0]->Multiplicity = rank_counts_before[rank]; - reportWalkersPerRank(dtest.comm, dtest.population); + WalkerControlBase::PopulationAdjustment pop_adjust{rank_counts_before[rank], + convertUPtrToRefVector(pop_->get_walkers()), + {rank_counts_before[rank] - 1}, + RefVector{}}; - if (dtest.comm->size() > 1) - { - if (dtest.comm->rank() == dtest.comm->size() - 2) - { - REQUIRE(dtest.population.get_num_local_walkers() == 2); - // This check is a bit too restrictive - no guarantee the last walker was the - // one transmitted - bool okay1 = dtest.population.get_walkers()[1]->ID == dtest.comm->size() || - dtest.population.get_walkers()[1]->ID == dtest.comm->size() + 1; - REQUIRE(okay1); - } - else if (dtest.comm->rank() == dtest.comm->size() - 1) - { - REQUIRE(dtest.population.get_num_local_walkers() == 2); - bool okay2 = dtest.population.get_walkers()[1]->ID == dtest.comm->size() || - dtest.population.get_walkers()[1]->ID == dtest.comm->size() + 1; - REQUIRE(okay2); - } - else - { - REQUIRE(dtest.population.get_num_local_walkers() == 1); - REQUIRE(dtest.population.get_walkers()[0]->ID == dtest.comm->rank()); - } - } + wc_.adjustPopulation(*pop_, pop_adjust); + WalkerControlBase::onRankSpawnKill(*pop_, pop_adjust); + + + wc_.Cur_pop = std::accumulate(rank_counts_before.begin(), rank_counts_before.end(), 0); - // And now the strange case - // 6 walkers on rank0, 2 on rank1, 2 on rank2 - if (dtest.comm->size() > 2) + auto proper_number_copies = [](int size) -> std::vector { return std::vector(size, 0); }; + WalkerControlBase::PopulationAdjustment pop_adjust2{rank_counts_before[rank], + convertUPtrToRefVector(pop_->get_walkers()), + proper_number_copies(pop_->get_num_local_walkers()), + RefVector{}}; + + auto num_per_node = WalkerControlBase::syncFutureWalkersPerRank(dpools_.comm, pop_->get_num_local_walkers()); + + reportWalkersPerRank(dpools_.comm, *pop_); + wc_.swapWalkersSimple(*pop_, pop_adjust2, num_per_node); + reportWalkersPerRank(dpools_.comm, *pop_); + CHECK(pop_->get_num_local_walkers() == rank_counts_after[rank]); +} + +void UnifiedDriverWalkerControlMPITest::reportWalkersPerRank(Communicate* c, MCPopulation& pop) +{ +#if !defined(NDEBUG) + std::vector rank_walker_count(c->size(), 0); + rank_walker_count[c->rank()] = pop.get_num_local_walkers(); + c->allreduce(rank_walker_count); + + if (c->rank() == 0) + { + std::cout << "Walkers Per Rank (Total: " << wc_.Cur_pop << ")\n"; + for (int i = 0; i < rank_walker_count.size(); ++i) { - if (dtest.comm->rank() == 0) - { - pop_adjust.good_walkers.push_back(dtest.population.spawnWalker()); - pop_adjust.good_walkers.back().get().ID = dtest.comm->size() + 3; - pop_adjust.copies_to_make.push_back(2); - pop_adjust.good_walkers.push_back(dtest.population.spawnWalker()); - pop_adjust.good_walkers.back().get().ID = dtest.comm->size() + 2; - pop_adjust.copies_to_make.push_back(1); - } - wc.NumPerNode[0] = 6; - wc.Cur_pop += 5; - - reportWalkersPerRank(dtest.comm, dtest.population); - wc.swapWalkersSimple(dtest.population, pop_adjust); - reportWalkersPerRank(dtest.comm, dtest.population); - // These are unique walkers - if (dtest.comm->rank() == dtest.comm->size() - 2) - { - CHECK(dtest.population.get_num_local_walkers() == 3); - } - else if (dtest.comm->rank() == dtest.comm->size() - 1) - { - CHECK(dtest.population.get_num_local_walkers() == 4); - } - else - { - CHECK(dtest.population.get_num_local_walkers() == 3); - } + std::cout << " " << i << " " << rank_walker_count[i] << '\n'; } } - private: - void reportWalkersPerRank(Communicate* c, MCPopulation& pop) +#endif +} + +} // namespace testing + +TEST_CASE("WalkerControlMPI::determineNewWalkerPopulation", "[drivers][walker_control]") +{ + int cur_pop = 5; + int num_contexts = 3; + + std::vector num_per_node = {3, 1, 1}; + + for (int i = 0; i < num_contexts; ++i) { - std::vector rank_walker_count(c->size(), 0); - rank_walker_count[c->rank()] = pop.get_num_local_walkers(); - c->allreduce(rank_walker_count); - if (c->rank() == 0) - { - std::cout << "Walkers Per Rank (Total: " << pop.get_num_global_walkers() << ")\n"; - for (int i = 0; i < rank_walker_count.size(); ++i) - { - std::cout << " " << i << " " << rank_walker_count[i] << "\n"; - } - } + std::vector fair_offset; + std::vector minus; + std::vector plus; + WalkerControlMPI::determineNewWalkerPopulation(cur_pop, num_contexts, i, num_per_node, fair_offset, minus, plus); + + CHECK(minus.size() == 2); + CHECK(plus.size() == 2); + } +} + +/** Here we manipulate just the Multiplicity of a set of 1 walkers per rank + */ +TEST_CASE("MPI WalkerControl multiplicity swap walkers", "[drivers][walker_control]") +{ + outputManager.pause(); + testing::UnifiedDriverWalkerControlMPITest test; + outputManager.resume(); + SECTION("Simple") + { + std::vector count_before{1, 1, 1}; + std::vector count_after{1, 1, 1}; + + // One walker on every node, should be no swapping + test.testMultiplicity(count_before, count_after); } -}; + SECTION("LoadBalance") + { + std::vector count_before{3, 1, 1}; + std::vector count_after{1, 2, 2}; + + test.testMultiplicity(count_before, count_after); + } } -TEST_CASE("MPI Walker Unified Driver swap walkers", "[drivers][walker_control]") +TEST_CASE("MPI WalkerControl population swap walkers", "[drivers][walker_control]") { + outputManager.pause(); testing::UnifiedDriverWalkerControlMPITest test; - test(); + outputManager.resume(); + + SECTION("Simple") + { + std::vector count_before{1, 1, 1}; + std::vector count_after{1, 1, 1}; + // One walker on every node, should be no swapping + test.testPopulationDiff(count_before, count_after); + } + + SECTION("LoadBalance") + { + std::vector count_before{3, 1, 1}; + std::vector count_after{1, 2, 2}; + test.testPopulationDiff(count_before, count_after); + } } } // namespace qmcplusplus diff --git a/src/QMCDrivers/tests/test_WalkerControlMPI.h b/src/QMCDrivers/tests/test_WalkerControlMPI.h new file mode 100644 index 0000000000..0e30b504c2 --- /dev/null +++ b/src/QMCDrivers/tests/test_WalkerControlMPI.h @@ -0,0 +1,41 @@ +////////////////////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source License. +// See LICENSE file in top directory for details. +// +// Copyright (c) 2019 QMCPACK developers. +// +// File developed by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory +// +// File created by: Peter Doak, doakpw@ornl.gov, Oak Ridge National Laboratory +////////////////////////////////////////////////////////////////////////////////////// + +#ifndef QMCPLUSPLUS_TEST_WALKERCONTROLMPI_H +#define QMCPLUSPLUS_TEST_WALKERCONTROLMPI_H + +#include "QMCDrivers/DMC/WalkerControlMPI.h" +#include "QMCDrivers/tests/SetupPools.h" + +namespace qmcplusplus +{ +namespace testing +{ +/** Once there is only one driver type rename + */ +class UnifiedDriverWalkerControlMPITest +{ +public: + UnifiedDriverWalkerControlMPITest(); + void testMultiplicity(std::vector& rank_counts_expanded, std::vector& rank_counts_after); + void testPopulationDiff(std::vector& rank_counts_before, std::vector& rank_counts_after); + +private: + void reportWalkersPerRank(Communicate* c, MCPopulation& pop); + + SetupPools dpools_; + UPtr pop_; + WalkerControlMPI wc_; +}; +} // namespace testing +} // namespace qmcplusplus + +#endif /* QMCPLUSPLUSTEST_WALKERCONTROLMPI_H */ diff --git a/src/QMCHamiltonians/OperatorBase.cpp b/src/QMCHamiltonians/OperatorBase.cpp index b9561f60ea..5c3be4b741 100644 --- a/src/QMCHamiltonians/OperatorBase.cpp +++ b/src/QMCHamiltonians/OperatorBase.cpp @@ -42,7 +42,31 @@ OperatorBase::OperatorBase() : myIndex(-1), Dependants(0), Value(0.0), tWalker(0 */ void OperatorBase::mw_evaluate(const RefVector& O_list, const RefVector& P_list) { -#pragma omp parallel for + /** Temporary raw omp pragma for simple thread parallelism + * ignoring the driver level concurrency + * + * \todo replace this with a proper abstraction. It should adequately describe the behavior + * and strictly limit the activation of this level concurrency to when it is intended. + * It is unlikely to belong in this function. + * + * This implicitly depends on openmp work division logic. Essentially adhoc runtime + * crowds over which we have given up control of thread/global scope. + * How many walkers per thread? How to handle their data movement if any of these + * hamiltonians should be accelerated? We can neither reason about or describe it in C++ + * + * As I understand it it should only be required for as long as the AMD openmp offload + * compliler is incapable of running multiple threads. They should/must fix their compiler + * before delivery of frontier and it should be removed at that point at latest + * + * If you want 16 threads of 1 walker that should be 16 crowds of 1 + * not one crowd of 16 with openmp thrown in at hamiltonian level. + * If this must be different from the other crowd batching. Make this a reasoned about + * and controlled level of concurency blocking at the driver level. + * + * This is only thread safe only if each walker has a complete + * set of anything involved in an Operator.evaluate. + */ + #pragma omp parallel for for (int iw = 0; iw < O_list.size(); iw++) O_list[iw].get().evaluate(P_list[iw]); }