From b318dd95293142ea9e2c52df6c40dfd684f90dca Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Sun, 6 Sep 2020 20:18:03 -0700 Subject: [PATCH 01/10] fixed issues with noncollinear in Walkers --- src/AFQMC/Walkers/WalkerIO.hpp | 13 ++++- src/AFQMC/Walkers/WalkerSetBase.h | 8 +-- src/AFQMC/Walkers/WalkerSetBase.icc | 5 ++ src/AFQMC/Walkers/tests/test_sharedwset.cpp | 65 +++++++++++++-------- 4 files changed, 60 insertions(+), 31 deletions(-) diff --git a/src/AFQMC/Walkers/WalkerIO.hpp b/src/AFQMC/Walkers/WalkerIO.hpp index 1132e7229f..6e1e5a64ff 100644 --- a/src/AFQMC/Walkers/WalkerIO.hpp +++ b/src/AFQMC/Walkers/WalkerIO.hpp @@ -201,6 +201,8 @@ bool restartFromHDF5(WalkerSet& wset, } TG.TG_local().broadcast_n(Idata.begin(), Idata.size()); + auto walker_type = wset.getWalkerType(); + int nWtot = Idata[0]; int wlk_nterms = Idata[2]; int NMO = Idata[4]; @@ -232,9 +234,10 @@ bool restartFromHDF5(WalkerSet& wset, int nw_local = nWN - nW0; { // to limit scope boost::multi::array PsiA, PsiB; + int NMO2 = ((walker_type == NONCOLLINEAR)?2*NMO:NMO); if (TG.TG_local().root()) { - PsiA.reextent({NMO, NAEA}); + PsiA.reextent({NMO2, NAEA}); if (wset.getWalkerType() == COLLINEAR) PsiB.reextent({NMO, NAEB}); } @@ -327,6 +330,8 @@ bool dumpToHDF5(WalkerSet& wset, hdf_archive& dump) NAEA = (*w.SlaterMatrix(Alpha)).size(1); if (walker_type == COLLINEAR) NAEB = (*w.SlaterMatrix(Beta)).size(1) + if (walker_type == NONCOLLINEAR) + NMO /= 2; } std::vector Idata(7); @@ -377,8 +382,10 @@ bool dumpToHDF5(WalkerSet& wset, hdf_archive& dump) auto w = wset[0]; NMO = (*w.SlaterMatrix(Alpha)).size(0); NAEA = (*w.SlaterMatrix(Alpha)).size(1); - if (walker_type == COLLINEAR) - NAEB = (*w.SlaterMatrix(Beta)).size(1); + if (walker_type == COLLINEAR) + NAEB = (*w.SlaterMatrix(Beta)).size(1); + if (walker_type == NONCOLLINEAR) + NMO /= 2; } std::vector Idata(7); diff --git a/src/AFQMC/Walkers/WalkerSetBase.h b/src/AFQMC/Walkers/WalkerSetBase.h index 7d36fc14f2..0f025a1df6 100644 --- a/src/AFQMC/Walkers/WalkerSetBase.h +++ b/src/AFQMC/Walkers/WalkerSetBase.h @@ -615,12 +615,10 @@ class WalkerSetBase : public AFQMCInfo int walkerSizeIO() { - if (walkerType == CLOSED) - return wlk_desc[0] * wlk_desc[1] + 7; - else if (walkerType == COLLINEAR) + if (walkerType == COLLINEAR) return wlk_desc[0] * (wlk_desc[1] + wlk_desc[2]) + 7; - else if (walkerType == NONCOLLINEAR) - return 2 * wlk_desc[0] * (wlk_desc[1] + wlk_desc[2]) + 7; + else // since NAEB = 0 in both CLOSED and NONCOLLINEAR cases + return wlk_desc[0] * wlk_desc[1] + 7; return 0; } diff --git a/src/AFQMC/Walkers/WalkerSetBase.icc b/src/AFQMC/Walkers/WalkerSetBase.icc index 22c84007be..70bdff1e35 100644 --- a/src/AFQMC/Walkers/WalkerSetBase.icc +++ b/src/AFQMC/Walkers/WalkerSetBase.icc @@ -58,6 +58,11 @@ void WalkerSetBase::parse(xmlNodePtr cur) app_log() << " Using a non-collinear (GHF) walker. \n"; walkerType = NONCOLLINEAR; } + else if (type.find("noncollinear") != std::string::npos) + { + app_log() << " Using a non-collinear (GHF) walker. \n"; + walkerType = NONCOLLINEAR; + } else if (type.find("collinear") != std::string::npos) { app_log() << " Using a collinear (UHF/ROHF) walker. 
\n"; diff --git a/src/AFQMC/Walkers/tests/test_sharedwset.cpp b/src/AFQMC/Walkers/tests/test_sharedwset.cpp index 75f0e3d270..c82e77dd20 100644 --- a/src/AFQMC/Walkers/tests/test_sharedwset.cpp +++ b/src/AFQMC/Walkers/tests/test_sharedwset.cpp @@ -78,7 +78,7 @@ void check(M1&& A, M2& B) using namespace afqmc; using communicator = boost::mpi3::communicator; -void test_basic_walker_features(bool serial) +void test_basic_walker_features(bool serial, std::string wtype) { auto world = boost::mpi3::environment::get_world_instance(); auto node = world.split_shared(world.rank()); @@ -92,6 +92,7 @@ void test_basic_walker_features(bool serial) //assert(world.size()%2 == 0); int NMO = 8, NAEA = 2, NAEB = 2, nwalkers = 10; + if(wtype=="noncollinear") { NAEA = 4; NAEB = 0; } //auto node = world.split_shared(); @@ -102,24 +103,24 @@ void test_basic_walker_features(bool serial) info.NAEA = NAEA; info.NAEB = NAEB; info.name = "walker"; - boost::multi::array initA({NMO, NAEA}); - boost::multi::array initB({NMO, NAEB}); - for (int i = 0; i < NAEA; i++) - initA[i][i] = Type(0.22); - for (int i = 0; i < NAEB; i++) - initB[i][i] = Type(0.22); + int M( (wtype=="noncollinear")?2*NMO:NMO ); + boost::multi::array initA({M,NAEA}); + boost::multi::array initB({M,NAEB}); + for(int i=0; i \ + std::string xml_block; + xml_block = " \ 0.05 \ 4 \ - closed \ + "+wtype+" \ async \ pair \ \ "; Libxml2Document doc; - bool okay = doc.parseFromString(xml_block); + bool okay = doc.parseFromString(xml_block.c_str()); REQUIRE(okay); WalkerSet wset(TG, doc.getRoot(), info, &rng); @@ -277,6 +278,9 @@ void test_hyperslab() REQUIRE(imag(DataIn[i][j]) == 0); } } + world.barrier(); + if (world.root()) + remove("dummy_walkers.h5"); } void test_double_hyperslab() @@ -354,9 +358,12 @@ void test_double_hyperslab() */ } } + world.barrier(); + if (world.root()) + remove("dummy_walkers.h5"); } -void test_walker_io() +void test_walker_io(std::string wtype) { auto world = boost::mpi3::environment::get_world_instance(); auto node = world.split_shared(world.rank()); @@ -370,6 +377,7 @@ void test_walker_io() //assert(world.size()%2 == 0); int NMO = 8, NAEA = 2, NAEB = 2, nwalkers = 10; + if(wtype=="noncollinear") { NAEA = 4; NAEB = 0; } //auto node = world.split_shared(); @@ -380,20 +388,20 @@ void test_walker_io() info.NAEA = NAEA; info.NAEB = NAEB; info.name = "walker"; - boost::multi::array initA({NMO, NAEA}); - boost::multi::array initB({NMO, NAEB}); - for (int i = 0; i < NAEA; i++) - initA[i][i] = Type(1.0); - for (int i = 0; i < NAEB; i++) - initB[i][i] = Type(1.0); + int M( (wtype=="noncollinear")?2*NMO:NMO ); + boost::multi::array initA({M,NAEA}); + boost::multi::array initB({M,NAEB}); + for(int i=0; i \ - closed \ + std::string xml_block; + xml_block = " \ + "+wtype+" \ \ "; Libxml2Document doc; - bool okay = doc.parseFromString(xml_block); + bool okay = doc.parseFromString(xml_block.c_str()); REQUIRE(okay); WalkerSet wset(TG, doc.getRoot(), info, &rng); @@ -467,12 +475,19 @@ void test_walker_io() REQUIRE(*wset[i].EJ() == *wset2[i].EJ()); } } + world.barrier(); + if (world.root()) + remove("dummy_walkers.h5"); } TEST_CASE("swset_test_serial", "[shared_wset]") { - test_basic_walker_features(true); - test_basic_walker_features(false); + test_basic_walker_features(true,"closed"); + test_basic_walker_features(false,"closed"); + test_basic_walker_features(true,"collinear"); + test_basic_walker_features(false,"collinear"); + test_basic_walker_features(true,"noncollinear"); + test_basic_walker_features(false,"noncollinear"); } /* 
TEST_CASE("hyperslab_tests", "[shared_wset]") @@ -481,6 +496,10 @@ TEST_CASE("hyperslab_tests", "[shared_wset]") test_double_hyperslab(); } */ -TEST_CASE("walker_io", "[shared_wset]") { test_walker_io(); } +TEST_CASE("walker_io", "[shared_wset]") { + test_walker_io("closed"); + test_walker_io("collinear"); + test_walker_io("noncollinear"); +} } // namespace qmcplusplus From d94f74be89cbbb456b14ec9b7648d33b170ac7f0 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Sun, 6 Sep 2020 21:14:19 -0700 Subject: [PATCH 02/10] changes to readWfn --- src/AFQMC/Utilities/readWfn.cpp | 48 ++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/AFQMC/Utilities/readWfn.cpp b/src/AFQMC/Utilities/readWfn.cpp index 14428fcfc1..466406b732 100644 --- a/src/AFQMC/Utilities/readWfn.cpp +++ b/src/AFQMC/Utilities/readWfn.cpp @@ -73,8 +73,6 @@ void read_header(std::ifstream& in, std::string& type, int& wfn_type, bool& full } case 2: { app_log() << "Reading a GHF-type trial wave-function. \n"; - app_error() << " GHF type not implemented. \n"; - APP_ABORT(" GHF type not implemented in readWfn.. \n"); break; } default: { @@ -353,15 +351,34 @@ void read_general_wavefunction(std::ifstream& in, APP_ABORT(" Error: Expecting Determinant: # tag in wavefunction file. \n"); read_mat(in, OrbMat, Cstyle, fullMOMat, NMO, NAEA); } - PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input(OrbMat, 1e-8, 'H', comm)); - if (walker_type == COLLINEAR) - PsiT.emplace_back( - csr::shm::construct_csr_matrix_single_input(OrbMat(OrbMat.extension(0), {0, NAEB}), 1e-8, - 'H', comm)); + if(walker_type==CLOSED) { + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( + OrbMat,1e-8,'H',comm)); + } else if(walker_type==COLLINEAR) { + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( + OrbMat,1e-8,'H',comm)); + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( + OrbMat(OrbMat.extension(0),{0,NAEB}), + 1e-8,'H',comm)); + } else if(walker_type==NONCOLLINEAR) { + APP_ABORT(" Error in readWfn: wfn_type==closed with walker_type=noncollinear.\n"); +/* + boost::multi::array Mat({2*NMO,NAEA+NAEB}); + if(comm.rank()==0) { + std::fill_n(Mat.origin(),2*NMO*(NAEA+NAEB),ComplexType(0.0)); + Mat({0,NMO},{0,NAEA}) = OrbMat; + Mat({NMO,2*NMO},{NAEA,NAEA+NAEB}) = OrbMat({0,NMO},{0,NAEB}); + } + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( + OrbMat,1e-8,'H',comm)); +*/ + } } } else if (wfn_type == 1) { + if(walker_type != COLLINEAR) + APP_ABORT(" Error in readWfn: wfn_type==collinear with walker_type!=collinear.\n"); boost::multi::array OrbMat({NMO, NAEA}); for (int i = 0, q = 0; i < ndets; i++) { @@ -372,16 +389,21 @@ void read_general_wavefunction(std::ifstream& in, APP_ABORT(" Error: Expecting Determinant: # tag in wavefunction file. 
\n"); read_mat(in, OrbMat, Cstyle, fullMOMat, NMO, NAEA); } + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input(OrbMat, 1e-8, 'H', comm)); if (comm.rank() == 0) read_mat(in, OrbMat(OrbMat.extension(0), {0, NAEB}), Cstyle, fullMOMat, NMO, NAEB); PsiT.emplace_back( - csr::shm::construct_csr_matrix_single_input(OrbMat(OrbMat.extension(0), {0, NAEB}), 1e-8, 'H', + csr::shm::construct_csr_matrix_single_input(OrbMat(OrbMat.extension(0), {0, NAEB}), 1e-8, 'H', comm)); } } else if (wfn_type == 2) { + if(walker_type != NONCOLLINEAR) + APP_ABORT(" Error in readWfn: wfn_type==collinear with walker_type!=collinear.\n"); + if(NAEB != 0) + APP_ABORT(" Error in readWfn: walker_type==collinear with NAEB!=0.\n"); boost::multi::array OrbMat({2 * NMO, NAEA}); for (int i = 0, q = 0; i < ndets; i++) { @@ -393,13 +415,7 @@ void read_general_wavefunction(std::ifstream& in, read_mat(in, OrbMat, Cstyle, fullMOMat, 2 * NMO, NAEA); } PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input(OrbMat, 1e-8, 'H', comm)); - if (comm.rank() == 0) - read_mat(in, OrbMat(OrbMat.extension(0), {0, NAEB}), Cstyle, fullMOMat, NMO, NAEB); - PsiT.emplace_back( - csr::shm::construct_csr_matrix_single_input(OrbMat(OrbMat.extension(0), {0, NAEB}), 1e-8, 'H', - comm)); } - } //type } else @@ -1020,8 +1036,8 @@ int readWfn(std::string fileName, boost::multi::array& OrbMat, i } else if (wfn_type == 2) { - APP_ABORT(" Error: wfn_type == 2 not implemented. \n"); - + OrbMat.reextent({1,2*NMO,NAEA+NAEB}); + read_mat(in,OrbMat[0],Cstyle,fullMOMat,2*NMO,NAEA+NAEB); } //type in.close(); From 5d8d446252a232090be35dd5c2c114e656d090d7 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Thu, 10 Sep 2020 10:52:38 -0700 Subject: [PATCH 03/10] working commit, making changes to KPFactorizedHamiltonian, not yet done --- .../tests/test_hamiltonian_operations.cpp | 30 ++++------ src/AFQMC/Hamiltonians/HamiltonianFactory.cpp | 8 +++ .../Hamiltonians/KPFactorizedHamiltonian.cpp | 58 ++++++++++--------- src/AFQMC/Utilities/kp_utilities.hpp | 46 ++++++++++----- 4 files changed, 81 insertions(+), 61 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp index d9be303656..554ab201f6 100644 --- a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp +++ b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp @@ -110,7 +110,6 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) Hamiltonian& ham = HamFac.getHamiltonian(gTG, ham_name); using CMatrix = ComplexMatrix; - boost::multi::array OrbMat({2, NMO, NAEA}); hdf_archive dump; if (!dump.open(UTEST_WFN, H5F_ACC_RDONLY)) { @@ -118,7 +117,6 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) } dump.push("Wavefunction", false); dump.push("NOMSD", false); - //int walker_type = readWfn(std::string(UTEST_WFN),OrbMat,NMO,NAEA,NAEB); std::vector dims(5); if (!dump.readEntry(dims, "dims")) { @@ -127,8 +125,8 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) } WALKER_TYPES WTYPE(initWALKER_TYPES(dims[3])); // int walker_type = dims[3]; - dump.push(std::string("PsiT_0")); - int NEL = (WTYPE == CLOSED) ? NAEA : (NAEA + NAEB); + int NEL = (WTYPE == CLOSED) ? NAEA : (NAEA + NAEB); + int NPOL = (WTYPE == NONCOLLINEAR) ? 
2 : 1 ; // WALKER_TYPES WTYPE = CLOSED; // if(walker_type==1) WTYPE = COLLINEAR; // if(walker_type==2) WTYPE = NONCOLLINEAR; @@ -137,32 +135,28 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) Alloc alloc_(make_localTG_allocator(TG)); std::vector PsiT; PsiT.reserve(2); + dump.push(std::string("PsiT_0")); PsiT.emplace_back(csr_hdf5::HDF2CSR>(dump, gTG.Node())); - //PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( - //OrbMat[0],1e-8,'H',gTG.Node())); - if (WTYPE != CLOSED) + if (WTYPE == COLLINEAR) { dump.pop(); dump.push(std::string("PsiT_1")); PsiT.emplace_back(csr_hdf5::HDF2CSR>(dump, gTG.Node())); - //PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( - //OrbMat[1](OrbMat.extension(1),{0,NAEB}), - //1e-8,'H',gTG.Node())); - //} } dump.pop(); + boost::multi::array OrbMat({2, NPOL*NMO, NAEA}); { - boost::multi::array Psi0A({NMO, NAEA}); + boost::multi::array Psi0A({NPOL*NMO, NAEA}); dump.readEntry(Psi0A, "Psi0_alpha"); - for (int i = 0; i < NMO; i++) + for (int i = 0; i < NPOL*NMO; i++) { for (int j = 0; j < NAEA; j++) { OrbMat[0][i][j] = Psi0A[i][j]; } } - if (WTYPE != CLOSED) + if (WTYPE == COLLINEAR) { boost::multi::array Psi0B({NMO, NAEA}); dump.readEntry(Psi0B, "Psi0_beta"); @@ -183,16 +177,16 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) // NOTE: Make small factory routine! #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) auto SDet( - SlaterDetOperations_serial(NMO, NAEA, + SlaterDetOperations_serial(NPOL*NMO, NAEA, afqmc::device_buffer_generator.get())); #else - auto SDet(SlaterDetOperations_shared(NMO, NAEA)); + auto SDet(SlaterDetOperations_shared(NPOL*NMO, NAEA)); #endif boost::multi::array devOrbMat(OrbMat, alloc_); std::vector devPsiT(move_vector(std::move(PsiT))); - CMatrix G({NEL, NMO}, alloc_); + CMatrix G({NEL, NPOL*NMO}, alloc_); ComplexType Ovlp = SDet.MixedDensityMatrix(devPsiT[0], devOrbMat[0], G.sliced(0, NAEA), 0.0, true); if (WTYPE == COLLINEAR) { @@ -203,7 +197,7 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) REQUIRE(imag(Ovlp) == Approx(0.0)); boost::multi::array Eloc({1, 3}, alloc_); - boost::multi::array_ref Gw(make_device_ptr(G.origin()), {1, NEL * NMO}); + boost::multi::array_ref Gw(make_device_ptr(G.origin()), {1, NEL * NPOL * NMO}); // This assumes {nwalk,...} which is not correct for some HamOps // make this generic HOps.energy(Eloc, Gw, 0, TG.getCoreID() == 0); diff --git a/src/AFQMC/Hamiltonians/HamiltonianFactory.cpp b/src/AFQMC/Hamiltonians/HamiltonianFactory.cpp index f7007003ed..afbe7dfab3 100644 --- a/src/AFQMC/Hamiltonians/HamiltonianFactory.cpp +++ b/src/AFQMC/Hamiltonians/HamiltonianFactory.cpp @@ -174,6 +174,14 @@ Hamiltonian HamiltonianFactory::fromHDF5(GlobalTaskGroup& gTG, xmlNodePtr cur) nkpts = Idata[2]; #endif + // MAM: this is wrong in NONCOLLINEAR, but how do I know what + // walker type it is right here??? + // Might need to read dimensions ahead of time from hdf5 file and check consistensy + // later + // Also, OneBodyHamiltonian doesn't make much sense now that you have KP classes. + // Consider refactoring this part of the code... + // It is not really used now, you can just read H1 in Sparse class too... + // 1 body hamiltonian: Why isn't this in shared memory!!! 
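// A possible follow-up for the NONCOLLINEAR concern above (sketch only: a
// `walker_type` flag is NOT available in this scope yet, which is exactly the
// problem the comment describes):
//   int npol = (walker_type == NONCOLLINEAR) ? 2 : 1;
//   boost::multi::array H1({npol * NMO, npol * NMO});
// i.e. the one-body matrix becomes spinor-sized, matching the npol factors
// introduced in KPFactorizedHamiltonian below.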
boost::multi::array H1({NMO, NMO}); diff --git a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp index 54d8eb8ea1..362f441016 100644 --- a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp +++ b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp @@ -84,6 +84,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b assert(PsiT.size() % 2 == 0); int nspins = ((type != COLLINEAR) ? 1 : 2); int ndet = PsiT.size() / nspins; + int npol = ((type == NONCOLLINEAR) ? 2 : 1); if (ndet > 1) APP_ABORT("Error: ndet > 1 not yet implemented in THCHamiltonian::getHamiltonianOperations.\n"); @@ -228,7 +229,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b int nmo_max = *std::max_element(nmo_per_kp.begin(), nmo_per_kp.end()); int nchol_max = *std::max_element(nchol_per_kp.begin(), nchol_per_kp.end()); - shmCTensor H1({nkpts, nmo_max, nmo_max}, shared_allocator{TG.Node()}); + shmCTensor H1({nkpts, npol*nmo_max, npol*nmo_max}, shared_allocator{TG.Node()}); std::vector LQKikn; LQKikn.reserve(nkpts); for (int Q = 0; Q < nkpts; Q++) @@ -244,7 +245,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b for (int Q = 0; Q < nkpts; Q++) { // until double_hyperslabs work! - boost::multi::array h1({nmo_per_kp[Q], nmo_per_kp[Q]}); + boost::multi::array h1({npol*nmo_per_kp[Q], npol*nmo_per_kp[Q]}); if (!dump.readEntry(h1, std::string("H1_kp") + std::to_string(Q))) { app_error() << " Error in KPFactorizedHamiltonian::getHamiltonianOperations():" @@ -253,7 +254,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b } // H1[Q]({0,nmo_per_kp[Q]},{0,nmo_per_kp[Q]}) = h1; // using add to get raw pointer dispatch, otherwise matrix copy is going to sync - ma::add(ComplexType(1.0), h1, ComplexType(0.0), h1, H1[Q]({0, nmo_per_kp[Q]}, {0, nmo_per_kp[Q]})); + ma::add(ComplexType(1.0), h1, ComplexType(0.0), h1, + H1[Q]({0, npol*nmo_per_kp[Q]}, {0, npol*nmo_per_kp[Q]})); } // read LQ if (!dump.push("KPFactorized", false)) @@ -314,7 +316,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b else { for (int i = 0; i < ndet; i++) - if (not get_nocc_per_kp(nmo_per_kp, PsiT[i], nocc_per_kp[i])) + if (not get_nocc_per_kp(nmo_per_kp, PsiT[i], nocc_per_kp[i],npol==2)) { app_error() << " Error in KPFactorizedHamiltonian::getHamiltonianOperations():" << " Only wavefunctions in block-diagonal form are accepted. \n"; @@ -337,7 +339,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b */ std::vector LQKank; LQKank.reserve(ndet * nspins * nkpts); // storing 2 components for Q=0, since it is not assumed symmetric - shmCMatrix haj({ndet * nkpts, (type == COLLINEAR ? 2 : 1) * nocc_max * nmo_max}, + shmCMatrix haj({ndet * nkpts, (type == COLLINEAR ? 
2 : 1) * nocc_max * npol * nmo_max}, shared_allocator{TG.Node()}); if (TG.Node().root()) std::fill_n(haj.origin(), haj.num_elements(), ComplexType(0.0)); @@ -464,16 +466,16 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b } else { - auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K); + auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); - boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {na, ni}); + boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {na, npol*ni}); if (na > 0) - ma::product(ComplexType(2.0), Psi, H1[K]({0, ni}, {0, ni}), ComplexType(0.0), haj_r); + ma::product(ComplexType(2.0), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r); } } } } - // Generate LQKank and haj + // Generate LQKank for (int nd = 0, nq0 = 0; nd < ndet; nd++, nq0 += nkpts * nspins) { for (int Q = 0; Q < nkpts; Q++) @@ -484,7 +486,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b { if (nt % Qcomm.size() == Qcomm.rank()) { - // haj and add half-transformed right-handed rotation for Q=0 + // add half-transformed right-handed rotation for Q=0 int Qm = kminus[Q]; int QK = QKtok2[Q][K]; int na = nocc_per_kp[nd][K]; @@ -529,7 +531,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b } else { - auto Psi = get_PsiK(nmo_per_kp, PsiT[nd], K); + auto Psi = get_PsiK(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); if (Q <= Qm) { @@ -586,7 +588,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b } else { - auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK); + auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na, nchol, ni}); ma_rotate::getLank_from_Lkin(PsiQK, Likn, Lbnl, buff); @@ -717,8 +719,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b SpMatrix Tabn({na, nb * nchol}); Sp3Tensor_ref T3abn(Tabn.origin(), {na, nb, nchol}); - auto Gal = get_PsiK>(nmo_per_kp, PsiT[0], Ka); - auto Gbk = get_PsiK>(nmo_per_kp, PsiT[0], Kb); + auto Gal = get_PsiK>(nmo_per_kp, PsiT[0], Ka, npol==2); + auto Gbk = get_PsiK>(nmo_per_kp, PsiT[0], Kb, npol==2); for (int a = 0; a < na; ++a) for (int l = 0; l < nl; ++l) Gal[a][l] = ma::conj(Gal[a][l]); @@ -810,6 +812,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( assert(PsiT.size() % 2 == 0); int nspins = ((type != COLLINEAR) ? 1 : 2); int ndet = PsiT.size() / nspins; + int npol = ((type == NONCOLLINEAR) ? 2 : 1); if (ndet > 1) APP_ABORT("Error: ndet > 1 not yet implemented in THCHamiltonian::getHamiltonianOperations.\n"); @@ -955,7 +958,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( int nmo_max = *std::max_element(nmo_per_kp.begin(), nmo_per_kp.end()); int nchol_max = *std::max_element(nchol_per_kp.begin(), nchol_per_kp.end()); - shmCTensor H1({nkpts, nmo_max, nmo_max}, shared_allocator{TG.Node()}); + shmCTensor H1({nkpts, npol*nmo_max, npol*nmo_max}, shared_allocator{TG.Node()}); std::vector LQKikn; LQKikn.reserve(nkpts); for (int Q = 0; Q < nkpts; Q++) @@ -971,7 +974,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( for (int Q = 0; Q < nkpts; Q++) { // until double_hyperslabs work! 
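// Note: npol == 2 only for NONCOLLINEAR, so the per-kpoint block of H1 read
// below is spinor-sized, {npol*nmo_per_kp[Q], npol*nmo_per_kp[Q]}, consistent
// with the H1 allocation above; for CLOSED/COLLINEAR (npol == 1) the shape is
// unchanged.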
- boost::multi::array h1({nmo_per_kp[Q], nmo_per_kp[Q]}); + boost::multi::array h1({npol*nmo_per_kp[Q], npol*nmo_per_kp[Q]}); if (!dump.readEntry(h1, std::string("H1_kp") + std::to_string(Q))) { app_error() << " Error in KPFactorizedHamiltonian::getHamiltonianOperations():" @@ -979,7 +982,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( APP_ABORT(""); } //H1[Q]({0,nmo_per_kp[Q]},{0,nmo_per_kp[Q]}) = h1; - ma::add(ComplexType(1.0), h1, ComplexType(0.0), h1, H1[Q]({0, nmo_per_kp[Q]}, {0, nmo_per_kp[Q]})); + ma::add(ComplexType(1.0), h1, ComplexType(0.0), h1, + H1[Q]({0, npol*nmo_per_kp[Q]}, {0, npol*nmo_per_kp[Q]})); } } if (distNode.root()) @@ -1054,7 +1058,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( else { for (int i = 0; i < ndet; i++) - if (not get_nocc_per_kp(nmo_per_kp, PsiT[i], nocc_per_kp[i])) + if (not get_nocc_per_kp(nmo_per_kp, PsiT[i], nocc_per_kp[i],npol==2)) { app_error() << " Error in KPFactorizedHamiltonian::getHamiltonianOperations():" << " Only wavefunctions in block-diagonal form are accepted. \n"; @@ -1078,7 +1082,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( */ std::vector LQKank; LQKank.reserve(ndet * nspins * nkpts); // storing 2 components for Q=0, since it is not assumed symmetric - shmCMatrix haj({ndet * nkpts, (type == COLLINEAR ? 2 : 1) * nocc_max * nmo_max}, + shmCMatrix haj({ndet * nkpts, (type == COLLINEAR ? 2 : 1) * nocc_max * npol * nmo_max}, shared_allocator{TG.Node()}); if (TG.Node().root()) std::fill_n(to_address(haj.origin()), haj.num_elements(), ComplexType(0.0)); @@ -1224,11 +1228,11 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( } else { - auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K); + auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); - boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {nocc_max, nmo_max}); + boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {nocc_max, npol*nmo_max}); if (na > 0) - ma::product(ComplexType(2.0), Psi, H1[K]({0, ni}, {0, ni}), ComplexType(0.0), haj_r({0, na}, {0, ni})); + ma::product(ComplexType(2.0), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r({0, na}, {0, npol*ni})); } } } @@ -1243,7 +1247,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { if (nt % Qcomm.size() == Qcomm.rank()) { - // haj and add half-transformed right-handed rotation for Q=0 + // add half-transformed right-handed rotation for Q=0 int Qm = kminus[Q]; int QK = QKtok2[Q][K]; int na = nocc_per_kp[nd][K]; @@ -1291,7 +1295,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( } else { - auto Psi = get_PsiK(nmo_per_kp, PsiT[nd], K); + auto Psi = get_PsiK(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); if (Q <= Qm) { @@ -1350,7 +1354,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( } else { - auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK); + auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nchol_max, nmo_max}); Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nmo_max, nchol_max}); @@ -1496,8 +1500,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( SpMatrix Tabn({na, nb * nchol_max}); Sp3Tensor_ref 
T3abn(Tabn.origin(), {na, nb, nchol_max}); - auto Gal = get_PsiK>(nmo_per_kp, PsiT[0], Ka); - auto Gbk = get_PsiK>(nmo_per_kp, PsiT[0], Kb); + auto Gal = get_PsiK>(nmo_per_kp, PsiT[0], Ka, npol==2); + auto Gbk = get_PsiK>(nmo_per_kp, PsiT[0], Kb, npol==2); for (int a = 0; a < na; ++a) for (int l = 0; l < nl; ++l) Gal[a][l] = ma::conj(Gal[a][l]); diff --git a/src/AFQMC/Utilities/kp_utilities.hpp b/src/AFQMC/Utilities/kp_utilities.hpp index 5c1eab4e69..7d7b64002d 100644 --- a/src/AFQMC/Utilities/kp_utilities.hpp +++ b/src/AFQMC/Utilities/kp_utilities.hpp @@ -23,18 +23,20 @@ * If it is block diagonal form, it also returns the number of states in each k-point block. */ template -bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per_kp) +bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per_kp, bool noncolin=false) { int nkpts = nmo_per_kp.size(); int N = PsiT.size(0); int M = PsiT.size(1); + int npol = noncolin ? 2 : 1; + assert( M%npol == 0 ); assert(nocc_per_kp.size() == nkpts); std::fill_n(to_address(nocc_per_kp.origin()), nkpts, 0); - std::vector bounds(nkpts + 1); + std::vector bounds(npol*nkpts + 1); bounds[0] = 0; - for (int k = 0; k < nkpts; k++) - bounds[k + 1] = bounds[k] + nmo_per_kp[k]; + for (int k = 0; k < npol*nkpts; k++) + bounds[k + 1] = bounds[k] + nmo_per_kp[k%nkpts]; int Q = 0; for (int i = 0; i < N; i++) { @@ -48,8 +50,10 @@ bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per // check the kp index of the first non-zero column. Mut be either >= Q auto it = std::lower_bound(bounds.begin(), bounds.end(), *col + 1) - 1; assert(it != bounds.end()); - int Q_ = std::distance(bounds.begin(), it); + int Q_ = std::distance(bounds.begin(), it)%nkpts; + int pol_ = std::distance(bounds.begin(), it)/nkpts; assert(Q_ >= 0 && Q_ < nkpts); + assert( pol_ == 0 || pol_== 1 ); if (Q_ < Q) { std::fill_n(to_address(nocc_per_kp.origin()), nkpts, 0); @@ -58,7 +62,8 @@ bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per Q = Q_; for (int ni = 0; ni < nt; ++ni, ++col) { - if (*col < bounds[Q] || *col >= bounds[Q + 1]) + if ( (*col < bounds[Q] || *col >= bounds[Q + 1]) && + (*col < bounds[(npol-1)*nkpts + Q] || *col >= bounds[(npol-1)*nkpts + Q + 1]) ) { std::fill_n(to_address(nocc_per_kp.origin()), nkpts, 0); return false; @@ -70,17 +75,19 @@ bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per } template -Array get_PsiK(Vector const& nmo_per_kp, CSR const& PsiT, int K) +Array get_PsiK(Vector const& nmo_per_kp, CSR const& PsiT, int K, bool noncolin=false) { int nkpts = nmo_per_kp.size(); int N = PsiT.size(0); int M = PsiT.size(1); + int npol = noncolin ? 2 : 1; + assert( M%npol == 0 ); int nel = 0; - std::vector bounds(nkpts + 1); + std::vector bounds(npol*nkpts + 1); bounds[0] = 0; - for (int k = 0; k < nkpts; k++) - bounds[k + 1] = bounds[k] + nmo_per_kp[k]; + for (int k = 0; k < npol*nkpts; k++) + bounds[k + 1] = bounds[k] + nmo_per_kp[k%nkpts]; int Q = 0; for (int i = 0; i < N; i++) { @@ -91,19 +98,22 @@ Array get_PsiK(Vector const& nmo_per_kp, CSR const& PsiT, int K) // check the kp index of the first non-zero column. 
Must be >= Q auto it = std::lower_bound(bounds.begin(), bounds.end(), *col + 1) - 1; assert(it != bounds.end()); int Q_ = std::distance(bounds.begin(), it)%nkpts; int pol_ = std::distance(bounds.begin(), it)/nkpts; assert(Q_ >= 0 && Q_ < nkpts); assert(pol_ == 0 || pol_ == 1); if (Q_ < Q) APP_ABORT("Error: PsiT not in block-diagonal form in get_PsiK.\n"); Q = Q_; for (int ni = 0; ni < nt; ++ni, ++col) if ( (*col < bounds[Q] || *col >= bounds[Q + 1]) && (*col < bounds[(npol-1)*nkpts + Q] || *col >= bounds[(npol-1)*nkpts + Q + 1]) ) APP_ABORT("Error: PsiT not in block-diagonal form in get_PsiK.\n"); if (Q == K) nel++; } using element = typename std::decay::type::element; Array A({nel, npol*nmo_per_kp[K]}); using std::fill_n; fill_n(A.origin(), A.num_elements(), element(0)); nel = 0; for (int i = 0; i < N; i++) { auto nt = PsiT.num_non_zero_elements(i); auto col = PsiT.non_zero_indices2_data(i); auto val = PsiT.non_zero_values_data(i); // check the kp index of the first non-zero column. Must be >= Q auto it = std::lower_bound(bounds.begin(), bounds.end(), *col + 1) - 1; int Q = std::distance(bounds.begin(), it)%nkpts; if (Q == K) { for (int ni = 0; ni < nt; ++ni, ++col, ++val) { if(*col < bounds[K+1]) // alpha A[nel][*col - bounds[K]] = static_cast(*val); else // beta A[nel][*col - bounds[nkpts + K] + nmo_per_kp[K]] = static_cast(*val); } nel++; } if (Q > K) From 67753f150dd3d496aab7ca7397934d69bdbf9e16 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Mon, 14 Sep 2020 17:58:06 -0700 Subject: [PATCH 04/10] fixed HamOps unit test to handle all types, fixed cholesky rotation routines and get_psiK to handle noncollinear, still not done with KP ham ops --- .../KP3IndexFactorization.hpp | 6 +- .../tests/test_hamiltonian_operations.cpp | 79 +++++++++------- .../Hamiltonians/KPFactorizedHamiltonian.cpp | 48 +++++----- .../SlaterDeterminantOperations/rotate.hpp | 90 ++++++++++++++----- 4 files changed, 141 insertions(+), 82 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp index 98061a7054..768da6b2b4 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp @@ -288,8 +288,7 @@ class KP3IndexFactorization int getKr = KEright != nullptr; int getKl = KEleft != nullptr; if (E.size(0) != nwalk || E.size(1) < 3) - APP_ABORT(" Error in AFQMC/HamiltonianOperations/sparse_matrix_energy::calculate_energy(). Incorrect matrix " - "dimensions \n"); + APP_ABORT(" Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). Incorrect matrix dimensions \n"); size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max); size_t cnt(0); if (addEJ) { @@ -659,8 +658,7 @@ class KP3IndexFactorization int getKr = KEright != nullptr; int getKl = KEleft != nullptr; if (E.size(0) != nwalk || E.size(1) < 3) - APP_ABORT(" Error in AFQMC/HamiltonianOperations/sparse_matrix_energy::calculate_energy(). Incorrect matrix " - "dimensions \n"); + APP_ABORT(" Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). 
Incorrect matrix dimensions\n"); size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max); size_t cnt(0); diff --git a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp index 554ab201f6..63fa7c2251 100644 --- a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp +++ b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp @@ -197,10 +197,15 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) REQUIRE(imag(Ovlp) == Approx(0.0)); boost::multi::array Eloc({1, 3}, alloc_); - boost::multi::array_ref Gw(make_device_ptr(G.origin()), {1, NEL * NPOL * NMO}); - // This assumes {nwalk,...} which is not correct for some HamOps - // make this generic - HOps.energy(Eloc, Gw, 0, TG.getCoreID() == 0); + { + int nc=1,nr=NEL * NPOL * NMO; + if(HOps.transposed_G_for_E()) { + nr=1; + nc=NEL * NPOL * NMO; + } + boost::multi::array_ref Gw(make_device_ptr(G.origin()), {nr, nc}); + HOps.energy(Eloc, Gw, 0, TG.getCoreID() == 0); + } Eloc[0][0] = (TG.Node() += ComplexType(Eloc[0][0])); Eloc[0][1] = (TG.Node() += ComplexType(Eloc[0][1])); Eloc[0][2] = (TG.Node() += ComplexType(Eloc[0][2])); @@ -229,38 +234,46 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) auto nCV = HOps.local_number_of_cholesky_vectors(); CMatrix X({nCV, 1}, alloc_); - //HOps.vbias(Gw,X,sqrtdt); + { + int nc=1,nr=NEL * NPOL * NMO; + if(HOps.transposed_G_for_vbias()) { + nr=1; + nc=NEL * NPOL * NMO; + } + boost::multi::array_ref Gw(make_device_ptr(G.origin()), {nr, nc}); + HOps.vbias(Gw,X,sqrtdt); + } TG.local_barrier(); ComplexType Xsum = 0; - //for(int i=0; i1e-8) { - //REQUIRE( real(Xsum) == Approx(real(file_data.Xsum)) ); - //REQUIRE( imag(Xsum) == Approx(imag(file_data.Xsum)) ); - //} else { - //app_log()<<" Xsum: " <1e-8) { + REQUIRE( real(Xsum) == Approx(real(file_data.Xsum)) ); + REQUIRE( imag(Xsum) == Approx(imag(file_data.Xsum)) ); + } else { + app_log()<<" Xsum: " <1e-8) { - //REQUIRE( real(Vsum) == Approx(real(file_data.Vsum)) ); - //REQUIRE( imag(Vsum) == Approx(imag(file_data.Vsum)) ); - //} else { - //app_log()<<" Vsum: " <1e-8) { + REQUIRE( real(Vsum) == Approx(real(file_data.Vsum)) ); + REQUIRE( imag(Vsum) == Approx(imag(file_data.Vsum)) ); + } else { + app_log()<<" Vsum: " <= 0) - LQKank.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{TG.Node()})); + LQKank.emplace_back(shmSpMatrix({nkpts, npol * ank_max}, shared_allocator{TG.Node()})); else LQKank.emplace_back(shmSpMatrix({1, 1}, shared_allocator{TG.Node()})); if (type == COLLINEAR) @@ -388,7 +388,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b for (int nd = 0; nd < ndet; nd++) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) - LQKbnl.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{TG.Node()})); + LQKbnl.emplace_back(shmSpMatrix({nkpts, npol * ank_max}, shared_allocator{TG.Node()})); if (type == COLLINEAR) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) @@ -435,7 +435,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b if (Q0 < 0) APP_ABORT(" Error: Could not find Q=0. 
\n"); - boost::multi::array buff({nmo_max, nchol_max}); + boost::multi::array buff({npol*nmo_max, nchol_max}); int nt = 0; for (int nd = 0; nd < ndet; nd++) { @@ -536,14 +536,14 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b if (Q <= Qm) { Sp3Tensor_ref Likn(to_address(LQKikn[Q][K].origin()), {ni, nk, nchol}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na, nchol, nk}); - ma_rotate::getLank(Psi, Likn, Lank, buff); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na*npol, nchol, nk}); + ma_rotate::getLank(Psi, Likn, Lank, buff, npol==2); } else { Sp3Tensor_ref Lkin(to_address(LQKikn[Qm][QK].origin()), {nk, ni, nchol}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na, nchol, nk}); - ma_rotate::getLank_from_Lkin(Psi, Lkin, Lank, buff); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na*npol, nchol, nk}); + ma_rotate::getLank_from_Lkin(Psi, Lkin, Lank, buff, npol==2); } } } @@ -590,8 +590,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b { auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); - Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na, nchol, ni}); - ma_rotate::getLank_from_Lkin(PsiQK, Likn, Lbnl, buff); + Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na*npol, nchol, ni}); + ma_rotate::getLank_from_Lkin(PsiQK, Likn, Lbnl, buff, npol==2); } } } @@ -683,6 +683,9 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b app_log() << " Sampling EXX energy using distribution over Q vector obtained from " << " trial energy. \n"; + if(npol == 2) + APP_ABORT("Error: nsampleQ>0 not yet implemented for noncollinear.\n\n\n"); + RealType scl = (type == CLOSED ? 2.0 : 1.0); size_t nqk = 0; for (int Q = 0; Q < nkpts; ++Q) @@ -1091,7 +1094,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { for (int Q = 0; Q < nkpts; Q++) if (Qmap[Q] >= 0) - LQKank.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKank.emplace_back(shmSpMatrix({nkpts, npol * ank_max}, shared_allocator{distNode})); else LQKank.emplace_back(shmSpMatrix({1, 1}, shared_allocator{distNode})); if (type == COLLINEAR) @@ -1114,7 +1117,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( for (int Q = 0; Q < nkpts; Q++) { if (Qmap[Q] >= 0) - LQKakn.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKakn.emplace_back(shmSpMatrix({nkpts, npol * ank_max}, shared_allocator{distNode})); else LQKakn.emplace_back(shmSpMatrix({1, 1}, shared_allocator{distNode})); } @@ -1194,7 +1197,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( APP_ABORT(" Error: Could not find Q=0. 
\n"); TG.Node().barrier(); - boost::multi::array buff({nmo_max, nchol_max}); + boost::multi::array buff({npol*nmo_max, nchol_max}); int nt = 0; for (int nd = 0; nd < ndet; nd++) { @@ -1300,16 +1303,16 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( if (Q <= Qm) { Sp3Tensor_ref Likn(to_address(LQKikn[Q][K].origin()), {nmo_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {nocc_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {nocc_max, nchol_max, nmo_max}); - ma_rotate_padded::getLakn_Lank(Psi, Likn, Lakn, Lank); + Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {npol*nocc_max, nmo_max, nchol_max}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {npol*nocc_max, nchol_max, nmo_max}); + ma_rotate_padded::getLakn_Lank(Psi, Likn, Lakn, Lank, npol==2); } else { Sp3Tensor_ref Lkin(to_address(LQKikn[Qm][QK].origin()), {nmo_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {nocc_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {nocc_max, nchol_max, nmo_max}); - ma_rotate_padded::getLakn_Lank_from_Lkin(Psi, Lkin, Lakn, Lank, buff); + Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {npol*nocc_max, nmo_max, nchol_max}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {npol*nocc_max, nchol_max, nmo_max}); + ma_rotate_padded::getLakn_Lank_from_Lkin(Psi, Lkin, Lakn, Lank, buff, npol==2); } } } @@ -1356,9 +1359,9 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); - Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nchol_max, nmo_max}); - Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nmo_max, nchol_max}); - ma_rotate_padded::getLakn_Lank_from_Lkin(PsiQK, Likn, Lbln, Lbnl, buff); + Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {npol*nocc_max, nchol_max, nmo_max}); + Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {npol*nocc_max, nmo_max, nchol_max}); + ma_rotate_padded::getLakn_Lank_from_Lkin(PsiQK, Likn, Lbln, Lbnl, buff, npol==2); } } } @@ -1464,6 +1467,9 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( app_log() << " Sampling EXX energy using distribution over Q vector obtained from " << " trial energy. \n"; + if(npol == 2) + APP_ABORT("Error: nsampleQ>0 not yet implemented for noncollinear.\n\n\n"); + RealType scl = (type == CLOSED ? 2.0 : 1.0); size_t nqk = 0; for (int Q = 0; Q < nkpts; ++Q) diff --git a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp index 311202a3d6..86c98c02b8 100644 --- a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp @@ -612,28 +612,41 @@ void halfRotateCholeskyMatrix(WALKER_TYPES type, } // design for compact arrays +/* + * Constructs the following contraction of the Slater matrix of the trial wfn and the Cholesky Matrix (Likn). + * Case: + * - Closed/Collinear: L[a][n][k] = sum_i A[a][i] L[i][k][n] + * - In collinear case, two separate calls are made for each spin channel. 
+ * - Non-collinear: L[a][s][n][k] = sum_i A[a][si] L[i][k][n] // [si] == [s][i] combined spinor index
+ * - In this case, to preserve matrix dimensions, [a][s] --> [as] is kept as a single index; the peculiar index order is dictated by its use in the energy evaluation.
+ */ template void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, MultiArray2D&& buff, bool noncollinear=false) { int npol = noncollinear ? 2 : 1; int na = Aai.size(0); if (na == 0) return; int ni = Aai.size(1) / npol; int nk = Likn.size(1); int nchol = Likn.size(2); assert(Likn.size(0) == ni); assert(Lank.size(0) == na*npol); assert(Lank.size(1) == nchol); assert(Lank.size(2) == nk); assert(buff.size(0) >= nk); assert(buff.size(1) >= nchol); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous using elementA = typename std::decay::type::element; using element = typename std::decay::type::element; boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); boost::multi::array_ref Li_kn(to_address(Likn.origin()), {ni, nk * nchol}); boost::multi::array_ref Las_kn(to_address(Lank.origin()), {na * npol, nk * nchol}); ma::product(Aas_i, Li_kn, Las_kn); for (int a = 0; a < na * npol; a++) { boost::multi::array_ref Lkn(to_address(Lank[a].origin()), {nk, nchol}); boost::multi::array_ref Lnk(to_address(Lank[a].origin()), {nchol, nk}); buff({0, nk}, {0, nchol}) = Lkn; ma::transpose(buff({0, nk}, {0, nchol}), Lnk); } } +/* + * Constructs the following contraction of the Slater matrix of the trial wfn and the Cholesky Matrix (Likn). + * Case: + * - Closed/Collinear: L[a][n][k] = sum_i A[a][i] conj(L[k][i][n]) + * - In collinear case, two separate calls are made for each spin channel. + * - Non-collinear: L[a][s][n][k] = sum_i A[a][si] conj(L[k][i][n]) // [si] == [s][i] combined spinor index + * - In this case, to preserve matrix dimensions, [a][s] --> [as] is kept as a single index; the peculiar index order is dictated by its use in the energy evaluation. + */ template void getLank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC&& Lank, MultiArray2D&& buff, bool noncollinear=false) { int npol = noncollinear ? 
2 : 1; int na = Aai.size(0); if (na == 0) return; - int ni = Aai.size(1); + int ni = Aai.size(1) / npol; int nk = Lkin.size(0); int nchol = Lkin.size(2); assert(Lkin.size(1) == ni); - assert(Lank.size(0) == na); + assert(Lank.size(0) == na * npol); assert(Lank.size(1) == nchol); assert(Lank.size(2) == nk); - assert(buff.num_elements() >= na * nchol); + assert(buff.num_elements() >= na * npol * nchol); + if(noncollinear) + assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous using Type = typename std::decay::type::element; - boost::multi::array_ref bna(to_address(buff.origin()), {nchol, na}); + using elementA = typename std::decay::type::element; + boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); + boost::multi::array_ref bna(to_address(buff.origin()), {nchol, na * npol}); // Lank[a][n][k] = sum_i Aai[a][i] conj(Lkin[k][i][n]) + // Lank[as][n][k] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nk; k++) { - ma::product(ma::H(Lkin[k]), ma::T(Aai), bna); - for (int a = 0; a < na; a++) + ma::product(ma::H(Lkin[k]), ma::T(Aas_i), bna); + for (int a = 0; a < na * npol; a++) for (int n = 0; n < nchol; n++) Lank[a][n][k] = bna[n][a]; } @@ -675,12 +702,13 @@ namespace ma_rotate_padded { // designed for padded arrays template -void getLakn_Lank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lakn, MultiArray3DC&& Lank) +void getLakn_Lank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lakn, MultiArray3DC&& Lank, bool noncollinear=false) { + int npol = noncollinear ? 2 : 1; int na = Aai.size(0); if (na == 0) return; - int ni = Aai.size(1); + int ni = Aai.size(1) / npol; int nmo = Likn.size(0); int nchol = Likn.size(2); @@ -689,17 +717,23 @@ void getLakn_Lank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lak assert(Lakn.size(1) == nmo); assert(Lakn.size(2) == nchol); + assert(Lakn.size(0) >= npol*na ); assert(Lakn.size(0) == Lank.size(0)); assert(Lank.size(1) == nchol); assert(Lank.size(2) == nmo); + if(noncollinear) + assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous + + using elmA = typename std::decay::type::element; using elmB = typename std::decay::type::element; using elmC = typename std::decay::type::element; + boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); boost::multi::array_ref Li_kn(Likn.origin(), {ni, nmo * nchol}); - boost::multi::array_ref La_kn(Lakn.origin(), {na, nmo * nchol}); + boost::multi::array_ref Las_kn(Lakn.origin(), {na * npol, nmo * nchol}); - ma::product(Aai, Li_kn, La_kn); + ma::product(Aas_i, Li_kn, Las_kn); for (int a = 0; a < na; a++) ma::transpose(Lakn[a], Lank[a]); } @@ -709,12 +743,14 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC&& Lakn, MultiArray3DC&& Lank, - MultiArray2D&& buff) + MultiArray2D&& buff, + bool noncollinear=false) { + int npol = noncollinear ? 
2 : 1; int na = Aai.size(0); if (na == 0) return; - int ni = Aai.size(1); + int ni = Aai.size(1) / npol; int nmo = Lkin.size(0); int nchol = Lkin.size(2); @@ -723,24 +759,30 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, assert(Lakn.size(1) == nmo); assert(Lakn.size(2) == nchol); + assert(Lakn.size(0) >= npol*na ); assert(Lakn.size(0) == Lank.size(0)); assert(Lank.size(1) == nchol); assert(Lank.size(2) == nmo); - assert(buff.num_elements() >= na * nchol); + if(noncollinear) + assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous + + assert(buff.num_elements() >= na * npol * nchol); using ptr2 = typename std::decay::type::element_ptr; using elm2 = typename std::decay::type::element; + using elmA = typename std::decay::type::element; - boost::multi::array_ref bna(buff.origin(), {nchol, na}); - // Lakn[a][k][n] = sum_i Aai[a][i] conj(Lkin[k][i][n]) + boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); + boost::multi::array_ref bna(buff.origin(), {nchol, na * npol}); + // Lakn[as][k][n] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nmo; k++) { ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bna); - for (int a = 0; a < na; a++) + for (int a = 0; a < na * npol; a++) Lakn[a][k] = bna({0, nchol}, a); } - for (int a = 0; a < na; a++) + for (int a = 0; a < na * npol; a++) ma::transpose(Lakn[a], Lank[a]); } From 8cc9a130bdb30815766b2a0de4bd2bf7f82a33c0 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Tue, 15 Sep 2020 12:33:27 -0700 Subject: [PATCH 05/10] noncollinear changes to SlaterDetOps and Wfn, unit tests in both pass on cpu --- .../KP3IndexFactorization.hpp | 127 +++++++++++------- .../tests/test_hamiltonian_operations.cpp | 7 +- .../Hamiltonians/KPFactorizedHamiltonian.cpp | 24 ++-- .../SlaterDetOperations_base.hpp | 23 +++- .../SlaterDetOperations_serial.hpp | 42 ++++-- .../SlaterDetOperations_shared.hpp | 23 +++- .../apply_expM.hpp | 11 +- .../SlaterDeterminantOperations/rotate.hpp | 62 ++++----- src/AFQMC/Wavefunctions/NOMSD.hpp | 4 +- .../Wavefunctions/WavefunctionFactory.cpp | 48 +++---- .../Wavefunctions/tests/test_wfn_factory.cpp | 10 +- 11 files changed, 233 insertions(+), 148 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp index 768da6b2b4..3f95758b0b 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp @@ -277,6 +277,7 @@ class KP3IndexFactorization int nwalk = Gc.size(1); int nspin = (walker_type == COLLINEAR ? 2 : 1); + int npol = (walker_type == NONCOLLINEAR ? 2 : 1); int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); int nmo_max = *std::max_element(nopk.begin(), nopk.end()); int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); @@ -290,7 +291,7 @@ class KP3IndexFactorization if (E.size(0) != nwalk || E.size(1) < 3) APP_ABORT(" Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). 
Incorrect matrix dimensions \n"); - size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max); + size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max * npol); size_t cnt(0); if (addEJ) { @@ -366,35 +367,35 @@ class KP3IndexFactorization for (int n = 0; n < nwalk; n++) std::fill_n(E[n].origin(), 3, ComplexType(0.)); - assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot); - boost::multi::array_cref G3Da(to_address(Gc.origin()), {nocca_tot, nmo_tot, nwalk}); + assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot); + boost::multi::array_cref G3Da(to_address(Gc.origin()), {nocca_tot * npol, nmo_tot, nwalk}); boost::multi::array_cref G3Db(to_address(Gc.origin()) + G3Da.num_elements() * (nspin - 1), {noccb_tot, nmo_tot, nwalk}); // with yet another mapping, it is possible to reduce the memory usage here! // avoiding for now! - Sp4Tensor_ref GKK(to_address(SM_TMats.origin()) + cnt, {nspin, nkpts, nkpts, nwalk * nmo_max * nocca_max}); + Sp4Tensor_ref GKK(to_address(SM_TMats.origin()) + cnt, {nspin, nkpts, nkpts, nwalk * npol * nmo_max * nocca_max}); GKaKjw_to_GKKwaj(nd, Gc, GKK, nocca_tot, noccb_tot, nmo_tot, nmo_max * nocca_max); comm->barrier(); // one-body contribution // haj[ndet*nkpts][nocc*nmo] - // not parallelized for now, since it would require customization of Wfn if (addH1) { - // must use Gc since GKK is is SP int na = 0, nk = 0, nb = 0; for (int n = 0; n < nwalk; n++) E[n][0] = E0; for (int K = 0; K < nkpts; ++K) { #ifdef MIXED_PRECISION - boost::multi::array_ref haj_K(to_address(haj[nd * nkpts + K].origin()), - {nelpk[nd][K], nopk[K]}); + // must use Gc since GKK is is SP + boost::multi::array_ref haj_K(to_address(haj[nd * nkpts + K].origin()), + {nelpk[nd][K], npol, nopk[K]}); for (int a = 0; a < nelpk[nd][K]; ++a) - ma::product(ComplexType(1.), ma::T(G3Da[na + a].sliced(nk, nk + nopk[K])), haj_K[a], ComplexType(1.), - E(E.extension(0), 0)); + for (int pol = 0; pol < npol; ++pol) + ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+pol].sliced(nk, nk + nopk[K])), + haj_K[a][pol], ComplexType(1.), E(E.extension(0), 0)); na += nelpk[nd][K]; if (walker_type == COLLINEAR) { @@ -407,11 +408,12 @@ class KP3IndexFactorization } nk += nopk[K]; #else + // use GKK nk = nopk[K]; { na = nelpk[nd][K]; - CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()), {na * nk}); - SpMatrix_ref Gaj(to_address(GKK[0][K][K].origin()), {nwalk, na * nk}); + CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()), {na * npol * nk}); + SpMatrix_ref Gaj(to_address(GKK[0][K][K].origin()), {nwalk, na * npol * nk}); ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0)); } if (walker_type == COLLINEAR) @@ -425,12 +427,10 @@ class KP3IndexFactorization } } - // move calculation of H1 here - // NOTE: For CLOSED/NONCOLLINEAR, can do all walkers simultaneously to improve perf. of GEMM - // Not sure how to do it for COLLINEAR. 
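// In the exchange block below, the noncollinear spin index is folded into the
// orbital/k-point index: the G blocks are {nwalk*na, npol*nl} and the
// half-rotated Cholesky blocks {na*nchol, npol*nk}, so the same GEMMs cover
// CLOSED/COLLINEAR (npol == 1) and NONCOLLINEAR (npol == 2).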
if (addEXX) { - size_t local_memory_needs = 2 * nwalk * nocca_max * nocca_max * nchol_max + 2 * nchol_max * nwalk; + size_t local_memory_needs = 2 * nwalk * nocca_max * nocca_max * nchol_max + + 2 * nchol_max * nwalk; if (TMats.num_elements() < local_memory_needs) TMats.reextent({local_memory_needs, 1}); cnt = 0; @@ -463,18 +463,21 @@ class KP3IndexFactorization int na = nelpk[nd][Ka]; int nk = nopk[Kk]; - SpMatrix_ref Gwal(GKK[0][Ka][Kl].origin(), {nwalk * na, nl}); - SpMatrix_ref Gwbk(GKK[0][Kb][Kk].origin(), {nwalk * nb, nk}); - SpMatrix_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), {na * nchol, nk}); + SpMatrix_ref Gwal(GKK[0][Ka][Kl].origin(), {nwalk * na , npol * nl}); + SpMatrix_ref Gwbk(GKK[0][Kb][Kk].origin(), {nwalk * nb , npol * nk}); + SpMatrix_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), + {na * nchol, npol * nk}); auto bnl_ptr(to_address(LQKank[nd * nspin * nkpts + Qm][Kb].origin())); if (Qmap[Q] > 0) bnl_ptr = to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin()); - SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, nl}); + SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, npol * nl}); SpMatrix_ref Twban(TMats.origin() + cnt, {nwalk * nb, na * nchol}); Sp4Tensor_ref T4Dwban(TMats.origin() + cnt, {nwalk, nb, na, nchol}); - SpMatrix_ref Twabn(Twban.origin() + Twban.num_elements(), {nwalk * na, nb * nchol}); - Sp4Tensor_ref T4Dwabn(Twban.origin() + Twban.num_elements(), {nwalk, na, nb, nchol}); + SpMatrix_ref Twabn(Twban.origin() + Twban.num_elements(), + {nwalk * na, nb * nchol}); + Sp4Tensor_ref T4Dwabn(Twban.origin() + Twban.num_elements(), + {nwalk, na, nb, nchol}); if (na > 0 && nb > 0) ma::product(Gwal, ma::T(Lbnl), Twabn); @@ -1280,6 +1283,7 @@ class KP3IndexFactorization typename = typename std::enable_if_t<(std::decay::type::dimensionality == 2)>> void vbias(const MatA& Gw, MatB&& v, double a = 1., double c = 0., int nd = 0) { + using std::copy_n; using GType = typename std::decay_t; using vType = typename std::decay::type::element; int nkpts = nopk.size(); @@ -1288,6 +1292,7 @@ class KP3IndexFactorization assert(v.size(0) == 2 * local_nCV); assert(v.size(1) == nwalk); int nspin = (walker_type == COLLINEAR ? 2 : 1); + int npol = (walker_type == NONCOLLINEAR ? 2 : 1); int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); int nmo_max = *std::max_element(nopk.begin(), nopk.end()); int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); @@ -1306,12 +1311,14 @@ class KP3IndexFactorization SPComplexType minusimhalfa(0.0, -0.5 * a * scl); SPComplexType imhalfa(0.0, 0.5 * a * scl); size_t local_memory_needs = 2 * nchol_max * nwalk; + if(walker_type == NONCOLLINEAR) + local_memory_needs += nmo_max * npol * nwalk; // for transposed G if (TMats.num_elements() < local_memory_needs) TMats.reextent({local_memory_needs, 1}); SpMatrix_ref vlocal(TMats.origin(), {2 * nchol_max, nwalk}); std::fill_n(vlocal.origin(), vlocal.num_elements(), SPComplexType(0.0)); - assert(Gw.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot); + assert(Gw.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot); const_sp_pointer Gptr(nullptr); // I WANT C++17!!!!!! 
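// (Sketch only, not part of this patch: with C++17 the type dispatch below
// could be written as
//   if constexpr (std::is_same<GType, SPComplexType>::value)
//     { /* point Gptr at Gw directly */ }
//   else
//     { /* cast-copy Gw into SM_TMats first, as done below */ }
// so the discarded branch is never instantiated.)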
if (std::is_same::value) @@ -1331,7 +1338,7 @@ class KP3IndexFactorization comm->barrier(); } - boost::multi::array_cref G3Da(Gptr, {nocca_tot, nmo_tot, nwalk}); + boost::multi::array_cref G3Da(Gptr, {nocca_tot, npol, nmo_tot, nwalk}); boost::multi::array_cref G3Db(Gptr + G3Da.num_elements() * (nspin - 1), {noccb_tot, nmo_tot, nwalk}); @@ -1343,7 +1350,7 @@ class KP3IndexFactorization } comm->barrier(); - size_t nqk = 0; // start count at 1 to "offset" the calcuation of E1 done at root + size_t nqk = 0; for (int K = 0; K < nkpts; ++K) { // K is the index of the kpoint pair of (a,k) for (int Q = 0; Q < nkpts; ++Q) @@ -1361,11 +1368,23 @@ class KP3IndexFactorization int nk0 = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0); auto&& v1 = vlocal({0, nchol}, {0, nwalk}); - Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][K].origin()), {na, nchol, nk}); - - // v1[Q][n][nw] += sum_K sum_a_k LQK[a][n][k] G[a][k][nw] - for (int a = 0; a < na; ++a) - ma::product(one, Lank[a], G3Da[na0 + a]({nk0, nk0 + nk}, {0, nwalk}), one, v1); + if (walker_type == NONCOLLINEAR) { + Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][K].origin()), {na, nchol, npol*nk}); + // v1[Q][n][nw] += sum_K sum_a_p_k LQK[a][n][p][k] G[a][p][k][nw] + for (int a = 0; a < na; ++a) { + SpMatrix_ref Ga( to_address(vlocal.origin())+vlocal.num_elements(), {npol*nk,nwalk} ); + SpMatrix_ref Ga_( Ga.origin(), {npol,nk*nwalk} ); + for(int p=0; p Gca(to_address(GKaKj.origin()), {nocca_tot, nmo_tot, nwalk}); + assert(GKaKj.num_elements() == (nocca_tot + noccb_tot) * npol * nmo_tot * nwalk); + assert(GKKaj.num_elements() == nspin * nkpts * nkpts * npol * akmax * nwalk); + boost::multi::array_cref Gca(to_address(GKaKj.origin()), {nocca_tot, npol, nmo_tot, nwalk}); boost::multi::array_cref Gcb(to_address(GKaKj.origin()) + Gca.num_elements(), {noccb_tot, nmo_tot, nwalk}); - boost::multi::array_ref GKK(to_address(GKKaj.origin()), {nspin, nkpts, nkpts, nwalk * akmax}); + boost::multi::array_ref GKK(to_address(GKKaj.origin()), {nspin, nkpts, nkpts, nwalk * npol * akmax}); int na0 = 0; for (int Ka = 0, Kaj = 0; Ka < nkpts; Ka++) { @@ -1603,18 +1637,21 @@ class KP3IndexFactorization continue; } auto G_(to_address(GKK[0][Ka][Kj].origin())); - int naj = na * nj; - for (int a = 0, aj = 0; a < na; a++) + int naj = na * nj * npol; + for (int a = 0, asj = 0; a < na; a++) { - auto Gc_(to_address(Gca[na0 + a][nj0].origin())); - for (int j = 0; j < nj; j++, aj++) + for (int p = 0; p < npol; p++) { - for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj) + auto Gc_(to_address(Gca[na0 + a][p][nj0].origin())); + for (int j = 0; j < nj; j++, asj++) + { + for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj) #ifdef MIXED_PRECISION - G_[waj + aj] = static_cast(*Gc_); + G_[waj + asj] = static_cast(*Gc_); #else - G_[waj + aj] = (*Gc_); + G_[waj + asj] = (*Gc_); #endif + } } } nj0 += nj; diff --git a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp index 63fa7c2251..066d3290d2 100644 --- a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp +++ b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp @@ -244,14 +244,17 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) HOps.vbias(Gw,X,sqrtdt); } TG.local_barrier(); - ComplexType Xsum = 0; - for(int i=0; i1e-8) { REQUIRE( real(Xsum) == Approx(real(file_data.Xsum)) ); REQUIRE( imag(Xsum) == Approx(imag(file_data.Xsum)) ); } else { 
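// No stored reference value for this case: print the computed sums so they
// can be added to the test's reference data.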
app_log()<<" Xsum: " <>(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {na, npol*ni}); if (na > 0) - ma::product(ComplexType(2.0), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r); + ma::product(ComplexType(scl), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r); } } } @@ -536,13 +537,13 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b if (Q <= Qm) { Sp3Tensor_ref Likn(to_address(LQKikn[Q][K].origin()), {ni, nk, nchol}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na*npol, nchol, nk}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na, nchol, npol*nk}); ma_rotate::getLank(Psi, Likn, Lank, buff, npol==2); } else { Sp3Tensor_ref Lkin(to_address(LQKikn[Qm][QK].origin()), {nk, ni, nchol}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na*npol, nchol, nk}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na, nchol, npol*nk}); ma_rotate::getLank_from_Lkin(Psi, Lkin, Lank, buff, npol==2); } } @@ -590,7 +591,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b { auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); - Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na*npol, nchol, ni}); + Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na, nchol, npol*ni}); ma_rotate::getLank_from_Lkin(PsiQK, Likn, Lbnl, buff, npol==2); } } @@ -1231,11 +1232,12 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( } else { + RealType scl = (type == CLOSED ? 2.0 : 1.0); auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {nocc_max, npol*nmo_max}); if (na > 0) - ma::product(ComplexType(2.0), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r({0, na}, {0, npol*ni})); + ma::product(ComplexType(scl), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r({0, na}, {0, npol*ni})); } } } @@ -1303,15 +1305,15 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( if (Q <= Qm) { Sp3Tensor_ref Likn(to_address(LQKikn[Q][K].origin()), {nmo_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {npol*nocc_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {npol*nocc_max, nchol_max, nmo_max}); + Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {nocc_max, npol*nmo_max, nchol_max}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {nocc_max, nchol_max, npol*nmo_max}); ma_rotate_padded::getLakn_Lank(Psi, Likn, Lakn, Lank, npol==2); } else { Sp3Tensor_ref Lkin(to_address(LQKikn[Qm][QK].origin()), {nmo_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {npol*nocc_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {npol*nocc_max, nchol_max, nmo_max}); + Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {nocc_max, npol*nmo_max, nchol_max}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {nocc_max, nchol_max, npol*nmo_max}); ma_rotate_padded::getLakn_Lank_from_Lkin(Psi, Lkin, Lakn, Lank, buff, npol==2); } } @@ -1359,8 +1361,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { auto PsiQK = 
get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); - Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {npol*nocc_max, nchol_max, nmo_max}); - Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {npol*nocc_max, nmo_max, nchol_max}); + Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nchol_max, npol*nmo_max}); + Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, npol*nmo_max, nchol_max}); ma_rotate_padded::getLakn_Lank_from_Lkin(PsiQK, Likn, Lbln, Lbnl, buff, npol==2); } } diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_base.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_base.hpp index 78fe074d40..949404693e 100644 --- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_base.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_base.hpp @@ -235,31 +235,42 @@ class SlaterDetOperations_base } template - void Propagate(Mat&& A, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N') + void Propagate(Mat&& A, const MatP1& P1, const MatV& V, + int order = 6, char TA = 'N', bool noncollinear = false) { + int npol = noncollinear ? 2 : 1; int NMO = A.size(0); int NAEA = A.size(1); + int M = NMO/npol; + assert(NMO%npol == 0); + assert(P1.size(0) == NMO); + assert(P1.size(1) == NMO); + assert(V.size(0) == M); + assert(V.size(1) == M); TMatrix TMN({NMO, NAEA}, buffer_generator->template get_allocator()); - TMatrix T1({NMO, NAEA}, buffer_generator->template get_allocator()); - TMatrix T2({NMO, NAEA}, buffer_generator->template get_allocator()); + TMatrix T1({M, NAEA}, buffer_generator->template get_allocator()); + TMatrix T2({M, NAEA}, buffer_generator->template get_allocator()); using ma::H; using ma::T; if (TA == 'H' || TA == 'h') { ma::product(ma::H(P1), std::forward(A), TMN); - SlaterDeterminantOperations::base::apply_expM(V, TMN, T1, T2, order, TA); + for(int p=0; p(A)); } else if (TA == 'T' || TA == 't') { ma::product(ma::T(P1), std::forward(A), TMN); - SlaterDeterminantOperations::base::apply_expM(V, TMN, T1, T2, order, TA); + for(int p=0; p(A)); } else { ma::product(P1, std::forward(A), TMN); - SlaterDeterminantOperations::base::apply_expM(V, TMN, T1, T2, order); + for(int p=0; p(A)); } } diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp index 0341cd8172..fca3c36ae2 100644 --- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp @@ -136,7 +136,7 @@ class SlaterDetOperations_serial : public SlaterDetOperations_base - void BatchedPropagate(std::vector& Ai, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N') + void BatchedPropagate(std::vector& Ai, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N', bool noncollinear = false) { static_assert(pointedType::dimensionality == 2, " dimenionality == 2"); static_assert(std::decay::type::dimensionality == 3, " dimenionality == 3"); @@ -144,35 +144,59 @@ class SlaterDetOperations_serial : public SlaterDetOperations_basetemplate get_allocator()); TTensor T1({nbatch, NMO, NAEA}, buffer_generator->template get_allocator()); TTensor T2({nbatch, NMO, NAEA}, buffer_generator->template get_allocator()); - using ma::H; - using ma::T; // could be batched when csrmm is batched if (TA == 'H' || TA == 'h') { for (int ib = 0; ib < nbatch; ib++) 
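// Editor's sketch (assumption for illustration, not the author's code): for
// noncollinear walkers the Slater matrix A has npol*M rows (M = NMO/npol) while
// V from vHS is only M x M and spin-diagonal, so exp(V) is applied to each M-row
// polarization block independently; the serial path does, in effect,
//   for (int p = 0; p < npol; ++p)
//     SlaterDeterminantOperations::base::apply_expM(
//         V, TMN.sliced(p * M, (p + 1) * M), T1, T2, order, TA);
// and the batched path below gets the same effect by folding npol into the
// batch dimension (TMN viewed as {npol*nbatch, M, NAEA}).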
ma::product(ma::H(P1), *Ai[ib], TMN[ib]); - SlaterDeterminantOperations::batched::apply_expM(V, TMN, T1, T2, order, TA); - for (int ib = 0; ib < nbatch; ib++) - ma::product(ma::H(P1), TMN[ib], *Ai[ib]); } else if (TA == 'T' || TA == 't') { for (int ib = 0; ib < nbatch; ib++) ma::product(ma::T(P1), *Ai[ib], TMN[ib]); + } + else + { + for (int ib = 0; ib < nbatch; ib++) + ma::product(P1, *(Ai[ib]), TMN[ib]); + } + + // Apply V + if(noncollinear) { + // treat 2 polarizations as separate elements in the batch + using TTensor_ref = boost::multi::array_ref; + TTensor_ref TMN_( TMN.origin(), {npol*nbatch, M, NAEA}); + TTensor_ref T1_( T1.origin(), {npol*nbatch, M, NAEA}); + TTensor_ref T2_( T2.origin(), {npol*nbatch, M, NAEA}); + SlaterDeterminantOperations::batched::apply_expM(V, TMN_, T1_, T2_, order, TA); + } else { SlaterDeterminantOperations::batched::apply_expM(V, TMN, T1, T2, order, TA); + } + + if (TA == 'H' || TA == 'h') + { + for (int ib = 0; ib < nbatch; ib++) + ma::product(ma::H(P1), TMN[ib], *Ai[ib]); + } + else if (TA == 'T' || TA == 't') + { for (int ib = 0; ib < nbatch; ib++) ma::product(ma::T(P1), TMN[ib], *Ai[ib]); } else { - for (int ib = 0; ib < nbatch; ib++) - ma::product(P1, *(Ai[ib]), TMN[ib]); - SlaterDeterminantOperations::batched::apply_expM(V, TMN, T1, T2, order); for (int ib = 0; ib < nbatch; ib++) ma::product(P1, TMN[ib], *Ai[ib]); } diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_shared.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_shared.hpp index 2e39bb1123..f205d38dc8 100644 --- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_shared.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_shared.hpp @@ -159,15 +159,23 @@ class SlaterDetOperations_shared : public SlaterDetOperations_base - void Propagate(Mat&& A, const MatP1& P1, const MatV& V, communicator& comm, int order = 6, char TA = 'N') + void Propagate(Mat&& A, const MatP1& P1, const MatV& V, communicator& comm, + int order = 6, char TA = 'N', bool noncollinear = false) { + int npol = noncollinear ? 
2 : 1; int NMO = A.size(0); int NAEA = A.size(1); - set_shm_buffer(comm, 3 * NAEA * NMO); - assert(SM_TMats->num_elements() >= 3 * NAEA * NAEA); + int M = NMO/npol; + assert(NMO%npol == 0); + assert(P1.size(0) == NMO); + assert(P1.size(1) == NMO); + assert(V.size(0) == M); + assert(V.size(1) == M); + set_shm_buffer(comm, NAEA * (NMO + 2*M) ); + assert(SM_TMats->num_elements() >= NAEA * (NMO + 2*M) ); boost::multi::array_ref T0(to_address(SM_TMats->origin()), {NMO, NAEA}); - boost::multi::array_ref T1(to_address(SM_TMats->origin()) + NMO * NAEA, {NMO, NAEA}); - boost::multi::array_ref T2(to_address(SM_TMats->origin()) + 2 * NMO * NAEA, {NMO, NAEA}); + boost::multi::array_ref T1(to_address(T0.origin()) + T0.num_elements(), {M, NAEA}); + boost::multi::array_ref T2(to_address(T1.origin()) + T1.num_elements(), {M, NAEA}); using ma::H; using ma::T; if (comm.root()) @@ -180,7 +188,8 @@ class SlaterDetOperations_shared : public SlaterDetOperations_base(A), T0); } comm.barrier(); - SlaterDeterminantOperations::shm::apply_expM(V, T0, T1, T2, comm, order, TA); + for(int p=0; p - void BatchedPropagate(std::vector& Ai, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N') + void BatchedPropagate(std::vector& Ai, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N', bool noncollinear=false) { APP_ABORT(" Error: Batched routines not compatible with SlaterDetOperations_shared::BatchedPropagate \n"); } diff --git a/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp b/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp index 24871376ac..ea1dd664b5 100644 --- a/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp @@ -34,7 +34,7 @@ namespace base * Calculate S = exp(im*V)*S using a Taylor expansion of exp(V) */ template -inline void apply_expM(const MatA& V, MatB& S, MatC& T1, MatC& T2, int order = 6, char TA = 'N') +inline void apply_expM(const MatA& V, MatB&& S, MatC& T1, MatC& T2, int order = 6, char TA = 'N') { assert(V.size(0) == V.size(1)); assert(V.size(1) == S.size(0)); @@ -80,7 +80,7 @@ namespace shm * V, S, T1, T2 are expected to be in shared memory. 
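 * Editor's illustration (sketch of the Taylor step shared by these apply_expM
 * variants; buffer roles assumed from the signatures):
 *   T1 = S;
 *   for (int n = 1; n <= order; ++n) {
 *     T2 = (im / n) * V * T1;   // after this step, T2 = (i V)^n / n! applied to S
 *     S += T2;
 *     swap(T1, T2);             // reuse the two scratch buffers
 *   }
 * with im = i for TA == 'N' and V conjugated/transposed as requested by TA.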
*/ template -inline void apply_expM(const MatA& V, MatB& S, MatC& T1, MatC& T2, communicator& comm, int order = 6, char TA = 'N') +inline void apply_expM(const MatA& V, MatB&& S, MatC& T1, MatC& T2, communicator& comm, int order = 6, char TA = 'N') { assert(V.size(0) == S.size(0)); assert(V.size(1) == S.size(0)); @@ -134,13 +134,14 @@ namespace batched * Calculate S = exp(im*V)*S using a Taylor expansion of exp(V) */ template -inline void apply_expM(const MatA& V, MatB& S, MatC& T1, MatC& T2, int order = 6, char TA = 'N') +inline void apply_expM(const MatA& V, MatB&& S, MatC& T1, MatC& T2, + int order = 6, char TA = 'N') { static_assert(std::decay::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3"); static_assert(std::decay::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3"); static_assert(std::decay::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3"); - assert(V.size(0) == S.size(0)); - assert(V.size(0) == T1.size(0)); + assert(V.size(0) == S.size(0)); + assert(V.size(0) == T1.size(0)); assert(V.size(0) == T2.size(0)); assert(V.size(1) == V.size(2)); assert(V.size(2) == S.size(1)); diff --git a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp index 86c98c02b8..cdf85807d4 100644 --- a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp @@ -617,8 +617,8 @@ void halfRotateCholeskyMatrix(WALKER_TYPES type, * Case: * - Closed/Collinear: L[a][n][k] = sum_i A[a][i] L[i][k][n] * - In collinear case, two separate calls are made for each spin channel. - * - Non-collinear: L[a][s][n][k] = sum_i A[a][si] L[i][k][n] // [si] == [s][i] combined spinor index - * - In this case, to preserve matrix dimenions, [a][s] --> [as] is kept as a single index, peculiar order is given by its use in Energy evaluation. + * - Non-collinear: L[a][n][sk] = sum_i A[a][si] L[i][k][n] // [si] == [s][i] combined spinor index + * - In this case, to preserve matrix dimensions, [s][k] --> [sk] is kept as a single index.
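+ * Editor's illustration (reference loops, not part of the original patch):
+ *   for (a < na) for (n < nchol) for (s < npol) for (k < nk)
+ *     Lank[a][n][s*nk + k] = sum_i Aai[a][s*ni + i] * Likn[i][k][n];
+ * implemented below as one gemm over the combined [a,s] row index followed by
+ * a per-a transpose from [sk][n] to [n][sk].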
*/ template void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, MultiArray2D&& buff, bool noncollinear=false) @@ -631,10 +631,10 @@ void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, Mu int nk = Likn.size(1); int nchol = Likn.size(2); assert(Likn.size(0) == ni); - assert(Lank.size(0) == na*npol); + assert(Lank.size(0) == na); assert(Lank.size(1) == nchol); - assert(Lank.size(2) == nk); - assert(buff.size(0) >= nk); + assert(Lank.size(2) == nk*npol); + assert(buff.size(0) >= npol * nk); assert(buff.size(1) >= nchol); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous @@ -646,12 +646,12 @@ void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, Mu boost::multi::array_ref Las_kn(to_address(Lank.origin()), {na * npol, nk * nchol}); ma::product(Aas_i, Li_kn, Las_kn); - for (int a = 0; a < na * npol; a++) + for (int a = 0; a < na; a++) { - boost::multi::array_ref Lkn(to_address(Lank[a].origin()), {nk, nchol}); - boost::multi::array_ref Lnk(to_address(Lank[a].origin()), {nchol, nk}); - buff({0, nk}, {0, nchol}) = Lkn; - ma::transpose(buff({0, nk}, {0, nchol}), Lnk); + boost::multi::array_ref Lskn(to_address(Lank[a].origin()), {npol * nk, nchol}); + boost::multi::array_ref Lnsk(to_address(Lank[a].origin()), {nchol, npol * nk}); + buff({0, npol * nk}, {0, nchol}) = Lskn; + ma::transpose(buff({0, npol * nk}, {0, nchol}), Lnsk); } } @@ -660,8 +660,8 @@ void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, Mu * Case: * - Closed/Collinear: L[a][n][k] = sum_i A[a][i] conj(L[k][i][n]) * - In collinear case, two separate calls are made for each spin channel. - * - Non-collinear: L[a][s][n][k] = sum_i A[a][si] conj(L[k][i][n]) // [si] == [s][i] combined spinor index - * - In this case, to preserve matrix dimenions, [a][s] --> [as] is kept as a single index, peculiar order is given by its use in Energy evaluation. + * - Non-collinear: L[a][n][sk] = sum_i A[a][si] conj(L[k][i][n]) // [si] == [s][i] combined spinor index + * - In this case, to preserve matrix dimensions, [s][k] --> [sk] is kept as a single index.
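+ * Editor's illustration (assumed reference semantics, not part of the patch):
+ * for each k one gemm forms b[n][(a,s)] = sum_i conj(Lkin[k][i][n]) * Aai[(a,s)][i],
+ * which the scatter loop below writes into the polarization-blocked layout
+ *   Lank[a][n][s*nk + k] = b[n][a*npol + s];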
*/ template void getLank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC&& Lank, MultiArray2D&& buff, bool noncollinear=false) @@ -674,9 +674,9 @@ void getLank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC& int nk = Lkin.size(0); int nchol = Lkin.size(2); assert(Lkin.size(1) == ni); - assert(Lank.size(0) == na * npol); + assert(Lank.size(0) == na); assert(Lank.size(1) == nchol); - assert(Lank.size(2) == nk); + assert(Lank.size(2) == nk * npol); assert(buff.num_elements() >= na * npol * nchol); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous @@ -684,15 +684,16 @@ void getLank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC& using Type = typename std::decay::type::element; using elementA = typename std::decay::type::element; boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); - boost::multi::array_ref bna(to_address(buff.origin()), {nchol, na * npol}); + boost::multi::array_ref bnas(to_address(buff.origin()), {nchol, na * npol}); // Lank[a][n][k] = sum_i Aai[a][i] conj(Lkin[k][i][n]) // Lank[as][n][k] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nk; k++) { - ma::product(ma::H(Lkin[k]), ma::T(Aas_i), bna); - for (int a = 0; a < na * npol; a++) + ma::product(ma::H(Lkin[k]), ma::T(Aas_i), bnas); + for (int a = 0; a < na; a++) for (int n = 0; n < nchol; n++) - Lank[a][n][k] = bna[n][a]; + for (int p = 0; p < npol; p++) + Lank[a][n][p*nk+k] = bnas[n][a*npol+p]; } } @@ -714,13 +715,13 @@ void getLakn_Lank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lak int nchol = Likn.size(2); assert(Likn.size(1) == nmo); - assert(Lakn.size(1) == nmo); + assert(Lakn.size(1) == npol*nmo); assert(Lakn.size(2) == nchol); - assert(Lakn.size(0) >= npol*na ); + assert(Lakn.size(0) >= na ); assert(Lakn.size(0) == Lank.size(0)); assert(Lank.size(1) == nchol); - assert(Lank.size(2) == nmo); + assert(Lank.size(2) == npol*nmo); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous @@ -756,13 +757,13 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, int nchol = Lkin.size(2); assert(Lkin.size(1) == nmo); - assert(Lakn.size(1) == nmo); + assert(Lakn.size(1) == npol*nmo); assert(Lakn.size(2) == nchol); - assert(Lakn.size(0) >= npol*na ); + assert(Lakn.size(0) >= na ); assert(Lakn.size(0) == Lank.size(0)); assert(Lank.size(1) == nchol); - assert(Lank.size(2) == nmo); + assert(Lank.size(2) == npol*nmo); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous @@ -774,15 +775,16 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, using elmA = typename std::decay::type::element; boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); - boost::multi::array_ref bna(buff.origin(), {nchol, na * npol}); - // Lakn[as][k][n] = sum_i Aai[as][i] conj(Lkin[k][i][n]) + boost::multi::array_ref bnas(buff.origin(), {nchol, na * npol}); + // Lakn[a][sk][n] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nmo; k++) { - ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bna); - for (int a = 0; a < na * npol; a++) - Lakn[a][k] = bna({0, nchol}, a); + ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bnas); + for (int a = 0; a < na; a++) + for (int p = 0; p < npol; p++) + Lakn[a][p*nmo+k] = bnas({0, nchol}, a*npol+p); } - for (int a = 0; a < na * npol; a++) + for (int a = 0; a < na; a++) ma::transpose(Lakn[a], Lank[a]); } diff --git a/src/AFQMC/Wavefunctions/NOMSD.hpp b/src/AFQMC/Wavefunctions/NOMSD.hpp index 
4cb29a64d6..d92a1b2399 100644 --- a/src/AFQMC/Wavefunctions/NOMSD.hpp +++ b/src/AFQMC/Wavefunctions/NOMSD.hpp @@ -753,7 +753,7 @@ class NOMSD : public AFQMCInfo return (full) ? (2 * NMO * NMO) : ((NAEA + NAEB) * NMO); break; case NONCOLLINEAR: - return (full) ? (4 * NMO * NMO) : ((NAEA + NAEB) * 2 * NMO); + return (full) ? (4 * NMO * NMO) : (NAEA * 2 * NMO); break; default: APP_ABORT(" Error: Unknown walker_type in dm_size. \n"); @@ -773,7 +773,7 @@ class NOMSD : public AFQMCInfo return (full) ? (arr{NMO, NMO}) : ((sp == Alpha) ? (arr{NAEA, NMO}) : (arr{NAEB, NMO})); break; case NONCOLLINEAR: - return (full) ? (arr{2 * NMO, 2 * NMO}) : (arr{NAEA + NAEB, 2 * NMO}); + return (full) ? (arr{2 * NMO, 2 * NMO}) : (arr{NAEA, 2 * NMO}); break; default: APP_ABORT(" Error: Unknown walker_type in dm_size. \n"); diff --git a/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp b/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp index 6c78c82bd1..78f1d6c01d 100644 --- a/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp +++ b/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp @@ -88,6 +88,9 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, int NMO = AFinfo.NMO; int NAEA = AFinfo.NAEA; int NAEB = AFinfo.NAEB; + int NPOL = (walker_type == NONCOLLINEAR) ? 2 : 1; + if( (walker_type == NONCOLLINEAR) && (NAEB != 0) ) + APP_ABORT(" Error in Wavefunctions/WavefunctionFactory::fromASCII: noncollinear && NAEB!=0. \n\n\n "); std::ifstream in; in.open(filename.c_str()); @@ -114,7 +117,7 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, read_ph_wavefunction(in, ndets_to_read, walker_type, TGwfn.Node(), NMO, NAEA, NAEB, PsiT_MO); assert(abij.number_of_configurations() == ndets_to_read); int NEL = (walker_type == NONCOLLINEAR) ? (NAEA + NAEB) : NAEA; - int N_ = (walker_type == NONCOLLINEAR) ? 2 * NMO : NMO; + int N_ = NPOL * NMO; ComplexType one(1.0, 0.0); if (walker_type == COLLINEAR) PsiT.reserve(2 * ndets_to_read); @@ -290,14 +293,14 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, auto guess = initial_guess.find(name); if (guess == initial_guess.end()) { - auto newg = initial_guess.insert(std::make_pair(name, boost::multi::array({2, NMO, NAEA}))); + auto newg = initial_guess.insert(std::make_pair(name, boost::multi::array({2, NPOL*NMO, NAEA}))); int iC = (walker_type != COLLINEAR ? initial_configuration : 2 * initial_configuration); if (iC >= PsiT.size()) APP_ABORT(" Error: initial_configuration > ndets_to_read \n"); if (!newg.second) APP_ABORT(" Error: Problems adding new initial guess. \n"); using ma::conj; - std::fill_n((newg.first)->second.origin(), 2 * NMO * NAEA, ComplexType(0.0, 0.0)); + std::fill_n((newg.first)->second.origin(), 2 * NPOL * NMO * NAEA, ComplexType(0.0, 0.0)); { auto pbegin = PsiT[iC].pointers_begin(); auto pend = PsiT[iC].pointers_end(); @@ -329,22 +332,15 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, if (TGwfn.TG_local().size() > 1) { - SlaterDetOperations SDetOp( - SlaterDetOperations_shared(((walker_type != NONCOLLINEAR) ? (NMO) : (2 * NMO)), - ((walker_type != NONCOLLINEAR) ? (NAEA) : (NAEA + NAEB)))); + SlaterDetOperations SDetOp(SlaterDetOperations_shared(NPOL*NMO,NAEA)); return Wavefunction(NOMSD(AFinfo, cur, TGwfn, std::move(SDetOp), std::move(HOps), std::move(ci), std::move(PsiT), walker_type, NCE, targetNW)); } else { SlaterDetOperations SDetOp( - SlaterDetOperations_serial(((walker_type != NONCOLLINEAR) - ? (NMO) - : (2 * NMO)), - ((walker_type != NONCOLLINEAR) - ? 
(NAEA) - : (NAEA + NAEB)), - device_buffer_generator.get())); + SlaterDetOperations_serial( + NPOL*NMO,NAEA,device_buffer_generator.get())); return Wavefunction(NOMSD(AFinfo, cur, TGwfn, std::move(SDetOp), std::move(HOps), std::move(ci), std::move(PsiT), walker_type, NCE, targetNW)); } @@ -362,6 +358,7 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, */ // assuming walker_type==COLLINEAR for now, specialize a type for perfect pairing PHMSD +// MAM: generatlize for NONCOLLINEAR later!!! if (walker_type != COLLINEAR) APP_ABORT("Error: PHMSD requires a COLLINEAR calculation.\n"); std::vector PsiT_MO; @@ -717,6 +714,9 @@ Wavefunction WavefunctionFactory::fromHDF5(TaskGroup_& TGprop, int NMO = AFinfo.NMO; int NAEA = AFinfo.NAEA; int NAEB = AFinfo.NAEB; + int NPOL = (walker_type == NONCOLLINEAR) ? 2 : 1; + if( (walker_type == NONCOLLINEAR) && (NAEB != 0) ) + APP_ABORT(" Error in Wavefunctions/WavefunctionFactory::fromASCII: noncollinear && NAEB!=0. \n\n\n "); std::vector excitations; @@ -832,9 +832,7 @@ Wavefunction WavefunctionFactory::fromHDF5(TaskGroup_& TGprop, if (TGwfn.TG_local().size() > 1) { - SlaterDetOperations SDetOp( - SlaterDetOperations_shared(((walker_type != NONCOLLINEAR) ? (NMO) : (2 * NMO)), - ((walker_type != NONCOLLINEAR) ? (NAEA) : (NAEA + NAEB)))); + SlaterDetOperations SDetOp(SlaterDetOperations_shared(NPOL*NMO,NAEA)); if (dense_trial == "yes") { using MType = ComplexMatrix>; @@ -858,13 +856,8 @@ Wavefunction WavefunctionFactory::fromHDF5(TaskGroup_& TGprop, else { SlaterDetOperations SDetOp( - SlaterDetOperations_serial(((walker_type != NONCOLLINEAR) - ? (NMO) - : (2 * NMO)), - ((walker_type != NONCOLLINEAR) - ? (NAEA) - : (NAEA + NAEB)), - device_buffer_generator.get())); + SlaterDetOperations_serial( + NPOL*NMO,NAEA,device_buffer_generator.get())); if (dense_trial == "yes") { using MType = ComplexMatrix>; @@ -1125,6 +1118,7 @@ void WavefunctionFactory::getInitialGuess(hdf_archive& dump, int NAEB, WALKER_TYPES walker_type) { + int NPOL = (walker_type==NONCOLLINEAR) ? 2 : 1; std::vector dims(5); if (!dump.readEntry(dims, "dims")) { @@ -1135,19 +1129,19 @@ void WavefunctionFactory::getInitialGuess(hdf_archive& dump, auto guess = initial_guess.find(name); if (guess == initial_guess.end()) { - auto newg = initial_guess.insert(std::make_pair(name, boost::multi::array({2, NMO, NAEA}))); + auto newg = initial_guess.insert(std::make_pair(name, boost::multi::array({2, NPOL*NMO, NAEA}))); if (!newg.second) APP_ABORT(" Error: Problems adding new initial guess. \n"); using ma::conj; - std::fill_n((newg.first)->second.origin(), 2 * NMO * NAEA, ComplexType(0.0, 0.0)); + std::fill_n((newg.first)->second.origin(), 2 * NPOL * NMO * NAEA, ComplexType(0.0, 0.0)); { - boost::multi::array Psi0Alpha({NMO, NAEA}); + boost::multi::array Psi0Alpha({NPOL*NMO, NAEA}); if (!dump.readEntry(Psi0Alpha, "Psi0_alpha")) { app_error() << " Error in WavefunctionFactory: Initial wavefunction Psi0_alpha not found. 
\n"; APP_ABORT(""); } - for (int i = 0; i < NMO; i++) + for (int i = 0; i < NPOL*NMO; i++) for (int j = 0; j < NAEA; j++) ((newg.first)->second)[0][i][j] = Psi0Alpha[i][j]; } diff --git a/src/AFQMC/Wavefunctions/tests/test_wfn_factory.cpp b/src/AFQMC/Wavefunctions/tests/test_wfn_factory.cpp index bed417e64e..d39ac3dcaa 100644 --- a/src/AFQMC/Wavefunctions/tests/test_wfn_factory.cpp +++ b/src/AFQMC/Wavefunctions/tests/test_wfn_factory.cpp @@ -86,6 +86,7 @@ void wfn_fac(boost::mpi3::communicator& world) int NAEA = file_data.NAEA; int NAEB = file_data.NAEB; WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); + int NPOL = (type == NONCOLLINEAR) ? 2 : 1; std::map InfoMap; InfoMap.insert(std::pair("info0", AFQMCInfo{"info0", NMO, NAEA, NAEB})); @@ -160,7 +161,7 @@ void wfn_fac(boost::mpi3::communicator& world) WalkerSet wset(TG, doc3.getRoot(), InfoMap["info0"], &rng); auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); if (type == COLLINEAR) @@ -298,7 +299,7 @@ void wfn_fac(boost::mpi3::communicator& world) WalkerSet wset2(TG, doc3.getRoot(), InfoMap["info0"], &rng); //auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); if (type == COLLINEAR) @@ -431,6 +432,7 @@ void wfn_fac_distributed(boost::mpi3::communicator& world, int ngroups) int NAEA = file_data.NAEA; int NAEB = file_data.NAEB; WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); + int NPOL = (type == NONCOLLINEAR) ? 2 : 1; std::map InfoMap; InfoMap.insert(std::pair("info0", AFQMCInfo{"info0", NMO, NAEA, NAEB})); @@ -503,7 +505,7 @@ void wfn_fac_distributed(boost::mpi3::communicator& world, int ngroups) WalkerSet wset(TG, doc3.getRoot(), InfoMap["info0"], &rng); auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); if (type == COLLINEAR) @@ -662,7 +664,7 @@ void wfn_fac_distributed(boost::mpi3::communicator& world, int ngroups) WalkerSet wset2(TG, doc3.getRoot(), InfoMap["info0"], &rng); //auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); if (type == COLLINEAR) From 41afd00404d7fd17bd888bad423d5cb300d98cfa Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Tue, 15 Sep 2020 17:56:01 -0700 Subject: [PATCH 06/10] fixed noncollienar issues in SlaterDetOps, BasePropagator and KP3 HOps --- .../KP3IndexFactorization.hpp | 96 ++++++++++++++----- src/AFQMC/Propagators/AFQMCBasePropagator.cpp | 1 + src/AFQMC/Propagators/AFQMCBasePropagator.icc | 37 ++++--- .../tests/test_propagator_factory.cpp | 11 ++- .../SlaterDetOperations_serial.hpp | 4 +- src/AFQMC/Wavefunctions/NOMSD.icc | 2 + 6 files changed, 97 insertions(+), 54 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp index 3f95758b0b..b5db2c8d2d 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp @@ -179,9 +179,7 @@ class KP3IndexFactorization { int nkpts = nopk.size(); 
int NMO = std::accumulate(nopk.begin(), nopk.end(), 0); - // in non-collinear case with SO, keep SO matrix here and add it - // for now, stay collinear - boost::multi::array P1({NMO, NMO}); + int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; // making a copy of vMF since it will be modified shmCVector vMF_(iextensions<1u>{vMF.num_elements()}, shared_allocator{*comm}); @@ -193,45 +191,91 @@ class KP3IndexFactorization } comm->barrier(); - boost::multi::array_ref P1D(to_address(P1.origin()), {NMO * NMO}); - std::fill_n(P1D.origin(), P1D.num_elements(), ComplexType(0)); - vHS(vMF_, P1D); + boost::multi::array P0({NMO, NMO}); + boost::multi::array_ref P0D(to_address(P0.origin()), {P0.num_elements()}); + std::fill_n(P0D.origin(), P0D.num_elements(), ComplexType(0)); + vHS(vMF_, P0D); if (TG.TG().size() > 1) - TG.TG().all_reduce_in_place_n(P1D.origin(), P1D.num_elements(), std::plus<>()); + TG.TG().all_reduce_in_place_n(P0D.origin(), P0D.num_elements(), std::plus<>()); - // add H1 + vn0 and symmetrize - // MAM: intel 19 doesn't accept this using statement, I'm forced to - // call ma::conj explicitly even though I should not need to - using ma::conj; + boost::multi::array P1({npol*NMO, npol*NMO}); + std::fill_n(P1.origin(), P1.num_elements(), ComplexType(0.0)); + // add spin-dependent H1 for (int K = 0, nk0 = 0; K < nkpts; ++K) { for (int i = 0, I = nk0; i < nopk[K]; i++, I++) { - P1[I][I] += H1[K][i][i] + vn0[K][i][i]; + for(int p=0; p 1e-5) + for(int p=0; p 1e-6) + P1[p*NMO+I][p*NMO+J] += H1[K][p*nopk[K]+i][p*nopk[K]+j]; + P1[p*NMO+J][p*NMO+I] += H1[K][p*nopk[K]+j][p*nopk[K]+i]; + } + } + if(walker_type == NONCOLLINEAR) { + // offdiagonal piece + for (int j = 0, J = nk0; j < nopk[K]; j++, J++) { -#endif - app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; - app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] << " " << H1[K][i][j] << " " - << H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl; - //APP_ABORT("Error in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"); + P1[I][NMO+J] += H1[K][i][nopk[K]+j]; + P1[NMO+J][I] += H1[K][nopk[K]+j][i]; } - P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I])); - P1[J][I] = ma::conj(P1[I][J]); } } nk0 += nopk[K]; } + + // add P0 (diagonal in spin) + for(int p=0; p 1e-5) + { +#else + if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6) + { +#endif + app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; + app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] <start(); auto walker_type = wset.getWalkerType(); + int npol = (walker_type==NONCOLLINEAR) ? 2 : 1; int nsteps = nsteps_; int nwalk = wset.size(); RealType sqrtdt = std::sqrt(dt); @@ -265,6 +266,7 @@ void AFQMCBasePropagator::BackPropagate(int nbpsteps, int nStabalize, WlkSet& ws { using std::copy_n; auto walker_type = wset.getWalkerType(); + int npol = (walker_type==NONCOLLINEAR) ? 2 : 1; int nwalk = wset.size(); int globalnCV = wfn.global_number_of_cholesky_vectors(); @@ -285,7 +287,7 @@ void AFQMCBasePropagator::BackPropagate(int nbpsteps, int nStabalize, WlkSet& ws assert(Fields.size(1) == globalnCV); assert(Fields.size(2) == nwalk); - int nrow(NMO * ((walker_type == NONCOLLINEAR) ? 2 : 1)); + int nrow(NMO*npol); int ncol(NAEA + ((walker_type == CLOSED) ? 
0 : NAEB)); assert(Refs.size(0) == nwalk); int nrefs = Refs.size(1); @@ -416,6 +418,7 @@ void AFQMCBasePropagator::apply_propagators(char TA, { int nwalk = wset.size(); auto walker_type = wset.getWalkerType(); + bool noncol = (walker_type == NONCOLLINEAR); int spin(0); if (spin_dependent_P1) @@ -455,28 +458,20 @@ void AFQMCBasePropagator::apply_propagators(char TA, for (int tk = tk0; tk < tkN; ++tk) { int nt = ni * nwalk + tk; - SDetOp->Propagate(*wset[tk].SlaterMatrix(Alpha), P1[0], vHS3D[nt], order, TA); + SDetOp->Propagate(*wset[tk].SlaterMatrix(Alpha), P1[0], vHS3D[nt], order, TA, noncol); } if (last_nextra > 0) { int iw = ntasks_total_serial + last_task_index; int nt = ni * nwalk + iw; - SDetOp->Propagate(*wset[iw].SlaterMatrix(Alpha), P1[0], vHS3D[nt], local_group_comm, order, TA); + SDetOp->Propagate(*wset[iw].SlaterMatrix(Alpha), P1[0], vHS3D[nt], local_group_comm, order, TA, noncol); } } } else { - if (walker_type == NONCOLLINEAR) - { - if (local_vHS.size(0) != 2 * NMO || local_vHS.size(1) != 2 * NMO) - local_vHS = std::move(CMatrix({2 * NMO, 2 * NMO})); - } - else - { - if (local_vHS.size(0) != NMO || local_vHS.size(1) != NMO) - local_vHS = std::move(CMatrix({NMO, NMO})); - } + if (local_vHS.size(0) != NMO || local_vHS.size(1) != NMO) + local_vHS = std::move(CMatrix({NMO, NMO})); // vHS3D[M][M][nstep*nwalk]: need temporary buffer in this case if (walker_type == COLLINEAR) { @@ -514,14 +509,14 @@ void AFQMCBasePropagator::apply_propagators(char TA, int nt = ni * nwalk + tk; local_vHS = vHS3D(local_vHS.extension(0), local_vHS.extension(1), nt); //std::cout<<" pp: " <Propagate(*wset[tk].SlaterMatrix(Alpha), P1[0], local_vHS, order, TA); + SDetOp->Propagate(*wset[tk].SlaterMatrix(Alpha), P1[0], local_vHS, order, TA, noncol); } if (last_nextra > 0) { int iw = ntasks_total_serial + last_task_index; int nt = ni * nwalk + iw; local_vHS = vHS3D(local_vHS.extension(0), local_vHS.extension(1), nt); - SDetOp->Propagate(*wset[iw].SlaterMatrix(Alpha), P1[0], local_vHS, local_group_comm, order, TA); + SDetOp->Propagate(*wset[iw].SlaterMatrix(Alpha), P1[0], local_vHS, local_group_comm, order, TA, noncol); } } } @@ -533,6 +528,7 @@ void AFQMCBasePropagator::apply_propagators_batched(char TA, WSet& wset, int ni, { int nwalk = wset.size(); auto walker_type = wset.getWalkerType(); + bool noncol = (walker_type == NONCOLLINEAR); int nbatch = std::min(nwalk, (nbatched_propagation < 0 ? nwalk : nbatched_propagation)); int spin(0); @@ -556,7 +552,7 @@ void AFQMCBasePropagator::apply_propagators_batched(char TA, WSet& wset, int ni, Ai.clear(); for (int ni = 0; ni < nb; ni++) Ai.emplace_back(wset[iw + ni].SlaterMatrix(Alpha)); - SDetOp->BatchedPropagate(Ai, P1[0], vHS3D.sliced(nt, nt + nb), order, TA); + SDetOp->BatchedPropagate(Ai, P1[0], vHS3D.sliced(nt, nt + nb), order, TA, noncol); if (walker_type == COLLINEAR) { Ai.clear(); @@ -568,13 +564,12 @@ void AFQMCBasePropagator::apply_propagators_batched(char TA, WSet& wset, int ni, } else { - int sz = (walker_type == NONCOLLINEAR ? 
2 * NMO : NMO); - if (local_vHS.size(0) != nbatch || local_vHS.size(1) != sz * sz) - local_vHS = std::move(CMatrix({nbatch, sz * sz})); + if (local_vHS.size(0) != nbatch || local_vHS.size(1) != NMO * NMO) + local_vHS = std::move(CMatrix({nbatch, NMO * NMO})); // vHS3D[M][M][nstep*nwalk]: need temporary buffer in this case int N2 = vHS3D.size(0) * vHS3D.size(1); CMatrix_ref vHS2D(vHS3D.origin(), {N2, vHS3D.size(2)}); - C3Tensor_ref local3D(local_vHS.origin(), {nbatch, sz, sz}); + C3Tensor_ref local3D(local_vHS.origin(), {nbatch, NMO, NMO}); int nt = ni * nwalk; for (int iw = 0; iw < nwalk; iw += nbatch, nt += nbatch) { @@ -583,7 +578,7 @@ void AFQMCBasePropagator::apply_propagators_batched(char TA, WSet& wset, int ni, Ai.clear(); for (int ni = 0; ni < nb; ni++) Ai.emplace_back(wset[iw + ni].SlaterMatrix(Alpha)); - SDetOp->BatchedPropagate(Ai, P1[0], local3D.sliced(0, nb), order, TA); + SDetOp->BatchedPropagate(Ai, P1[0], local3D.sliced(0, nb), order, TA, noncol); if (walker_type == COLLINEAR) { Ai.clear(); diff --git a/src/AFQMC/Propagators/tests/test_propagator_factory.cpp b/src/AFQMC/Propagators/tests/test_propagator_factory.cpp index 77cd3e6944..4ca73a30fe 100644 --- a/src/AFQMC/Propagators/tests/test_propagator_factory.cpp +++ b/src/AFQMC/Propagators/tests/test_propagator_factory.cpp @@ -78,6 +78,8 @@ void propg_fac_shared(boost::mpi3::communicator& world) int NMO, NAEA, NAEB; std::tie(NMO, NAEA, NAEB) = read_info_from_hdf(UTEST_HAMIL); + WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); + int NPOL = (type == NONCOLLINEAR) ? 2 : 1; std::map InfoMap; InfoMap.insert(std::pair("info0", AFQMCInfo{"info0", NMO, NAEA, NAEB})); @@ -104,7 +106,6 @@ void propg_fac_shared(boost::mpi3::communicator& world) // initialize TG buffer make_localTG_buffer_generator(TG.TG_local(), 20 * 1024L * 1024L); - WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); const char* wlk_xml_block_closed = " \ closed \ \ @@ -148,7 +149,7 @@ void propg_fac_shared(boost::mpi3::communicator& world) WalkerSet wset(TG, doc3.getRoot(), InfoMap["info0"], &rng); auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); wset.resize(nwalk, initial_guess[0], initial_guess[0]); // initial_guess[1](XXX.extension(0),{0,NAEB})); @@ -165,7 +166,6 @@ void propg_fac_shared(boost::mpi3::communicator& world) Propagator& prop = PropgFac.getPropagator(TG, prop_name, wfn, &rng); std::cout << setprecision(12); - std::cout << "energy " << std::endl; wfn.Energy(wset); { ComplexType eav = 0, ov = 0; @@ -260,6 +260,8 @@ void propg_fac_distributed(boost::mpi3::communicator& world, int ngrp) int NMO, NAEA, NAEB; std::tie(NMO, NAEA, NAEB) = read_info_from_hdf(UTEST_HAMIL); + WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); + int NPOL = (type == NONCOLLINEAR) ? 
2 : 1; std::map InfoMap; InfoMap.insert(std::pair("info0", AFQMCInfo{"info0", NMO, NAEA, NAEB})); @@ -289,7 +291,6 @@ void propg_fac_distributed(boost::mpi3::communicator& world, int ngrp) // initialize TG buffer make_localTG_buffer_generator(TG.TG_local(), 20 * 1024L * 1024L); - WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); const char* wlk_xml_block_closed = " \ closed \ \ @@ -330,7 +331,7 @@ void propg_fac_distributed(boost::mpi3::communicator& world, int ngrp) WalkerSet wset(TG, doc3.getRoot(), InfoMap["info0"], &rng); auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); wset.resize(nwalk, initial_guess[0], initial_guess[0]); diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp index fca3c36ae2..73ff2e0ba6 100644 --- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp @@ -127,12 +127,12 @@ class SlaterDetOperations_serial : public SlaterDetOperations_base - void Propagate(Mat&& A, const MatP1& P1, const MatV& V, communicator& comm, int order = 6, char TA = 'N') + void Propagate(Mat&& A, const MatP1& P1, const MatV& V, communicator& comm, int order = 6, char TA = 'N', bool noncollinear = false) { #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) APP_ABORT(" Error: SlaterDetOperations_serial should not be here. \n"); #endif - Base::Propagate(std::forward(A), P1, V, order, TA); + Base::Propagate(std::forward(A), P1, V, order, TA, noncollinear); } template diff --git a/src/AFQMC/Wavefunctions/NOMSD.icc b/src/AFQMC/Wavefunctions/NOMSD.icc index 5ad40ac542..e970967620 100644 --- a/src/AFQMC/Wavefunctions/NOMSD.icc +++ b/src/AFQMC/Wavefunctions/NOMSD.icc @@ -2314,6 +2314,8 @@ void NOMSD::vMF(Vec&& v) template inline void NOMSD::recompute_ci() { + if (walker_type == NONCOLLINEAR) + APP_ABORT(" Error: NOMSD::recompute_ci not implemented with NONCOLLINEAR.\n"); double LogOverlapFactor(0.0); int ndets = ci.size(); int nspin(1); From cc45a76609a2a870f157121821962525efac9fed Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Wed, 16 Sep 2020 09:39:44 -0700 Subject: [PATCH 07/10] fixed issues on gpu with recent noncollinear changes, all unit tests pass non-collinear KP input --- .../boost_multi/multi/memory/allocator.hpp | 3 +- .../KP3IndexFactorization_batched.hpp | 147 ++++++++++++------ .../Hamiltonians/KPFactorizedHamiltonian.cpp | 4 +- .../detail/CUDA/Kernels/KaKjw_to_KKwaj.cu | 58 ++++--- .../detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh | 6 + .../detail/CUDA/Kernels/KaKjw_to_QKajw.cu | 52 ++++--- .../detail/CUDA/Kernels/KaKjw_to_QKajw.cuh | 6 + .../detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp | 56 ++++--- .../detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h | 6 + .../detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp | 52 ++++--- .../detail/HIP/Kernels/KaKjw_to_QKajw.hip.h | 6 + src/AFQMC/Numerics/tensor_operations.hpp | 44 ++++-- .../Numerics/tests/test_tensor_operations.cpp | 4 +- .../SlaterDetOperations_serial.hpp | 2 +- .../apply_expM.hpp | 84 ++++++++++ .../SlaterDeterminantOperations/rotate.hpp | 2 +- 16 files changed, 374 insertions(+), 158 deletions(-) diff --git a/external_codes/boost_multi/multi/memory/allocator.hpp b/external_codes/boost_multi/multi/memory/allocator.hpp index d10f9d86f5..2e93733cb4 100644 --- 
a/external_codes/boost_multi/multi/memory/allocator.hpp +++ b/external_codes/boost_multi/multi/memory/allocator.hpp @@ -49,7 +49,8 @@ class allocator{ bool operator!=(allocator const& o) const{return mp_ != o.mp_;} using void_pointer = typename std::pointer_traits()->allocate(0, 0))>::template rebind; pointer allocate(size_type n){ - return static_cast(static_cast(mp_->allocate(n*sizeof(value_type), alignof(T)))); + return static_cast(static_cast(mp_->allocate(n*sizeof(value_type), 16))); + //return static_cast(static_cast(mp_->allocate(n*sizeof(value_type), alignof(T)))); } void deallocate(pointer p, size_type n){ mp_->deallocate(p, n*sizeof(value_type)); diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp index 270071a365..5d20c41780 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp @@ -82,6 +82,8 @@ class KP3IndexFactorization_batched using CMatrix_cref = boost::multi::array_ref; using CVector_ref = ComplexVector_ref; using CMatrix_ref = ComplexMatrix_ref; + using C3Tensor_ref = Complex3Tensor_ref; + using C4Tensor_ref = ComplexArray_ref<4, pointer>; using C3Tensor_cref = boost::multi::array_ref; using SpMatrix_cref = boost::multi::array_ref; @@ -338,50 +340,96 @@ class KP3IndexFactorization_batched { int nkpts = nopk.size(); int NMO = std::accumulate(nopk.begin(), nopk.end(), 0); - // in non-collinear case with SO, keep SO matrix here and add it - // for now, stay collinear + int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; CVector vMF_(vMF); - CVector P1D(iextensions<1u>{NMO * NMO}); - fill_n(P1D.origin(), P1D.num_elements(), ComplexType(0)); - vHS(vMF_, P1D); + CVector P0D(iextensions<1u>{NMO * NMO}); + fill_n(P0D.origin(), P0D.num_elements(), ComplexType(0)); + vHS(vMF_, P0D); if (TG_.TG().size() > 1) - TG_.TG().all_reduce_in_place_n(to_address(P1D.origin()), P1D.num_elements(), std::plus<>()); + TG_.TG().all_reduce_in_place_n(to_address(P0D.origin()), P0D.num_elements(), std::plus<>()); - boost::multi::array P1({NMO, NMO}); - copy_n(P1D.origin(), NMO * NMO, P1.origin()); + boost::multi::array P0({NMO, NMO}); + copy_n(P0D.origin(), NMO * NMO, P0.origin()); - // add H1 + vn0 and symmetrize - using ma::conj; + boost::multi::array P1({npol*NMO, npol*NMO}); + std::fill_n(P1.origin(), P1.num_elements(), ComplexType(0.0)); + // add spin-dependent H1 + for (int K = 0, nk0 = 0; K < nkpts; ++K) + { + for (int i = 0, I = nk0; i < nopk[K]; i++, I++) + { + for(int p=0; p 1e-5) + for(int p=0; p 1e-6) - { -#endif - app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; - app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] << " " << H1[K][i][j] << " " - << H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl; - //APP_ABORT("Error in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"); + P1[p*NMO+I][p*NMO+J] += vn0[K][i][j]; + P1[p*NMO+J][p*NMO+I] += vn0[K][j][i]; } - P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I])); - P1[J][I] = ma::conj(P1[I][J]); } } nk0 += nopk[K]; } + + using ma::conj; + // symmetrize + for(int I=0; I 1e-5) + { +#else + if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6) + { +#endif + app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. 
\n"; + app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] <template get_allocator()); GKaKjw_to_GKKwaj(G3Da, GKK[0], nelpk[nd].sliced(0, nkpts), dev_nelpk[nd], dev_a0pk[nd]); if (walker_type == COLLINEAR) @@ -519,10 +568,11 @@ class KP3IndexFactorization_batched for (int K = 0; K < nkpts; ++K) { #if defined(MIXED_PRECISION) - CMatrix_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, nmo_max}); + C3Tensor_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol, nmo_max}); for (int a = 0; a < nelpk[nd][K]; ++a) - ma::product(ComplexType(1.), ma::T(G3Da[na + a].sliced(nk, nk + nopk[K])), haj_K[a].sliced(0, nopk[K]), - ComplexType(1.), E({0, nwalk}, 0)); + for (int pol = 0; pol < npol; ++pol) + ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+p].sliced(nk, nk + nopk[K])), + haj_K[a][pol].sliced(0, nopk[K]), ComplexType(1.), E({0, nwalk}, 0)); na += nelpk[nd][K]; if (walker_type == COLLINEAR) { @@ -538,8 +588,8 @@ class KP3IndexFactorization_batched nk = nopk[K]; { na = nelpk[nd][K]; - CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * nmo_max}); - SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * nmo_max}); + CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * npol * nmo_max}); + SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * npol * nmo_max}); ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E({0, nwalk}, 0)); } if (walker_type == COLLINEAR) @@ -582,7 +632,7 @@ class KP3IndexFactorization_batched // I WANT C++17!!!!!! long mem_ank(0); if (needs_copy) - mem_ank = nkpts * nocc_max * nchol_max * nmo_max; + mem_ank = nkpts * nocc_max * nchol_max * npol * nmo_max; StaticVector LBuff(iextensions<1u>{2 * mem_ank}, device_buffer_allocator->template get_allocator()); sp_pointer LQptr(nullptr), LQmptr(nullptr); @@ -660,8 +710,8 @@ class KP3IndexFactorization_batched if (batch_cnt >= batch_size) { - gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0), - Aarray.data(), nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(), + gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0), + Aarray.data(), npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max, Aarray.size()); copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin()); @@ -691,8 +741,8 @@ class KP3IndexFactorization_batched if (batch_cnt > 0) { - gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0), Aarray.data(), - nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max, + gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0), Aarray.data(), + npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max, Aarray.size()); copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin()); @@ -1322,6 +1372,7 @@ class KP3IndexFactorization_batched assert(v.size(0) == 2 * local_nCV); assert(v.size(1) == nwalk); int nspin = (walker_type == COLLINEAR ? 2 : 1); + int npol = (walker_type == NONCOLLINEAR ? 
2 : 1); int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); int nmo_max = *std::max_element(nopk.begin(), nopk.end()); int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); @@ -1340,11 +1391,11 @@ class KP3IndexFactorization_batched SPComplexType minusimhalfa(0.0, -0.5 * a * scl); SPComplexType imhalfa(0.0, 0.5 * a * scl); - assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot); + assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot); // MAM: use reshape when available, then no need to deal with types using GType = typename std::decay::type::element; boost::multi::array_ref G3Da(make_device_ptr(G.origin()), - {nocca_tot, nmo_tot, nwalk}); + {nocca_tot*npol, nmo_tot, nwalk}); boost::multi::array_ref G3Db(make_device_ptr(G.origin()) + G3Da.num_elements() * (nspin - 1), @@ -1358,7 +1409,7 @@ class KP3IndexFactorization_batched size_t cnt(0); Static3Tensor v1({nkpts + number_of_symmetric_Q, nchol_max, nwalk}, device_buffer_allocator->template get_allocator()); - Static3Tensor GQ({nkpts, nkpts * nocc_max * nmo_max, nwalk}, + Static3Tensor GQ({nkpts, nkpts * nocc_max * npol * nmo_max, nwalk}, device_buffer_allocator->template get_allocator()); fill_n(v1.origin(), v1.num_elements(), SPComplexType(0.0)); fill_n(GQ.origin(), GQ.num_elements(), SPComplexType(0.0)); @@ -1370,7 +1421,7 @@ class KP3IndexFactorization_batched dev_a0pk[nd].sliced(nkpts, 2 * nkpts)); // can use productStridedBatched if LQKakn is changed to a 3Tensor array - int Kak = nkpts * nocc_max * nmo_max; + int Kak = nkpts * nocc_max * npol * nmo_max; std::vector Aarray; std::vector Barray; std::vector Carray; @@ -1530,30 +1581,32 @@ class KP3IndexFactorization_batched template void GKaKjw_to_GKKwaj(MatA const& GKaKj, MatB&& GKKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0) { + int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; int nmo_max = *std::max_element(nopk.begin(), nopk.end()); // int nocc_max = *std::max_element(nocc.begin(),nocc.end()); int nmo_tot = GKaKj.size(1); int nwalk = GKaKj.size(2); int nkpts = nopk.size(); - assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max); + assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max); using ma::KaKjw_to_KKwaj; - KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), + KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), dev_a0.origin(), GKaKj.origin(), GKKaj.origin()); } template void GKaKjw_to_GQKajw(MatA const& GKaKj, MatB&& GQKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0) { + int npol = (walker_type == NONCOLLINEAR) ? 
2 : 1; int nmo_max = *std::max_element(nopk.begin(), nopk.end()); // int nocc_max = *std::max_element(nocc.begin(),nocc.end()); int nmo_tot = GKaKj.size(1); int nwalk = GKaKj.size(2); int nkpts = nopk.size(); - assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max); + assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max); using ma::KaKjw_to_QKajw; - KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), + KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), dev_a0.origin(), dev_QKToK2.origin(), GKaKj.origin(), GQKaj.origin()); } diff --git a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp index 27b9a7ee72..ff281cd796 100644 --- a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp +++ b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp @@ -1143,7 +1143,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( for (int nd = 0; nd < ndet; nd++) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) - LQKbnl.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKbnl.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator{distNode})); if (type == COLLINEAR) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) @@ -1161,7 +1161,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { for (int Q = 0; Q < number_of_symmetric_Q; Q++) { - LQKbln.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKbln.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator{distNode})); } if (type == COLLINEAR) { diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu index acdc06c653..45c56c7c90 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu @@ -24,11 +24,12 @@ namespace kernels { // very sloppy, needs improvement!!!! 
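// Editor's note (assumed semantics of the kernels below, for orientation): the
// repack implemented is
//   B[Ka][Kj][w][a][p][j] = A[na0[Ka] + a][p][nj0[Kj] + j][w]
// with one CUDA block per (Ka, Kj, p) triple -- gridDim = {nkpts, nkpts, npol} --
// and the (j, w) loops strided over threadIdx.x / threadIdx.y.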
-// A[nocc_tot][nmo_tot][nwalk] -// B[Ka][Kj][nwalk][nocc_max][nmo_max] +// A[nocc_tot][s][nmo_tot][nwalk] +// B[Ka][Kj][nwalk][nocc_max][s][nmo_max] template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -41,30 +42,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - T const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); } template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -77,30 +80,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - thrust::complex const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + thrust::complex const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); } void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -114,8 +119,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -123,6 +128,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -136,8 +142,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - 
kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -145,6 +151,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -158,8 +165,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -167,6 +174,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -180,8 +188,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); @@ -190,6 +198,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -203,8 +212,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); @@ -213,6 +222,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -226,8 +236,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh index 0553530836..cb2ee706b5 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh @@ -23,6 +23,7 @@ namespace kernels { void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -34,6 +35,7 @@ void 
KaKjw_to_KKwaj(int nwalk,
                     double* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -45,6 +47,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     float* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -56,6 +59,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     float* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -67,6 +71,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     std::complex<double>* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -78,6 +83,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     std::complex<float>* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,

diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu
index 92f84e1675..f6527b0e72 100644
--- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu
+++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu
@@ -29,6 +29,7 @@ namespace kernels
 template <typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -42,7 +43,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol >= npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -50,15 +52,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na = nocc[K];
   int nj = nmo[QK];
-  T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);

   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;

-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol * nmo_max * nwalk, a1 = pol * nmo_tot * nwalk;
+       a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<T2>(A_[a1 + j * nwalk + n]);
@@ -67,6 +70,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 template <typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -77,7 +81,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol >= npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -85,15 +90,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na = nocc[K];
   int nj = nmo[QK];
-  thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);

   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;

-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol * nmo_max * nwalk, a1 = pol * nmo_tot * nwalk;
+       a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
   {
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
@@
-106,6 +112,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -120,8 +127,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -129,6 +136,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -143,8 +151,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -152,6 +160,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -166,8 +175,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -175,6 +184,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -189,8 +199,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); @@ -199,6 +209,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -213,8 +224,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); @@ -223,6 +234,7 @@ void KaKjw_to_QKajw(int nwalk, void 
KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -237,8 +249,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh index 164e6c5988..0315a29b9a 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh @@ -23,6 +23,7 @@ namespace kernels { void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -35,6 +36,7 @@ void KaKjw_to_QKajw(int nwalk, double* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -47,6 +49,7 @@ void KaKjw_to_QKajw(int nwalk, float* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -59,6 +62,7 @@ void KaKjw_to_QKajw(int nwalk, float* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -71,6 +75,7 @@ void KaKjw_to_QKajw(int nwalk, std::complex* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -83,6 +88,7 @@ void KaKjw_to_QKajw(int nwalk, std::complex* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp index b4bd304bae..0bbcd01573 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp @@ -25,6 +25,7 @@ namespace kernels template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -37,30 +38,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - T const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); } template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -73,30 +76,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = 
blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - thrust::complex const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + thrust::complex const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); } void KaKjw_to_KKwaj(int nwalk, - int nkpts, + int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -110,8 +115,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -119,6 +124,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -132,8 +138,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -141,6 +147,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -154,8 +161,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -163,6 +170,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -176,8 +184,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 
block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); @@ -186,6 +194,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -199,8 +208,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); @@ -209,6 +218,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -222,8 +232,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h index 1fa43f4927..ebab86b49b 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h @@ -20,6 +20,7 @@ namespace kernels { void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -31,6 +32,7 @@ void KaKjw_to_KKwaj(int nwalk, double* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -42,6 +44,7 @@ void KaKjw_to_KKwaj(int nwalk, float* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -53,6 +56,7 @@ void KaKjw_to_KKwaj(int nwalk, float* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -64,6 +68,7 @@ void KaKjw_to_KKwaj(int nwalk, std::complex* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -75,6 +80,7 @@ void KaKjw_to_KKwaj(int nwalk, std::complex* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp index 9b9ca77ccd..6fd46e0bc1 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp @@ -26,6 +26,7 @@ namespace kernels template 
__global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -39,7 +40,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol >= npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -47,15 +49,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na = nocc[K];
   int nj = nmo[QK];
-  T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);

   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;

-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol * nmo_max * nwalk, a1 = pol * nmo_tot * nwalk;
+       a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<T2>(A_[a1 + j * nwalk + n]);
@@ -64,6 +67,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 template <typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -77,7 +81,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol >= npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -85,15 +90,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na = nocc[K];
   int nj = nmo[QK];
-  thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);

   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;

-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol * nmo_max * nwalk, a1 = pol * nmo_tot * nwalk;
+       a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
   {
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
@@ -103,6 +109,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,

 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -117,8 +124,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B);
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
   qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw");
@@ -126,6 +133,7 @@ void KaKjw_to_QKajw(int nwalk,

 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -140,8 +148,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3
grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -149,6 +157,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -163,8 +172,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -172,6 +181,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -186,8 +196,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); @@ -196,6 +206,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -210,8 +221,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); @@ -220,6 +231,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -234,8 +246,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); diff --git 
a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h
index e773dce130..c8b84a0219 100644
--- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h
+++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h
@@ -20,6 +20,7 @@ namespace kernels
 {
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -32,6 +33,7 @@ void KaKjw_to_QKajw(int nwalk,
                     double* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -44,6 +46,7 @@ void KaKjw_to_QKajw(int nwalk,
                     float* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -56,6 +59,7 @@ void KaKjw_to_QKajw(int nwalk,
                     float* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -68,6 +72,7 @@ void KaKjw_to_QKajw(int nwalk,
                     std::complex<double>* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -80,6 +85,7 @@ void KaKjw_to_QKajw(int nwalk,
                     std::complex<float>* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,

diff --git a/src/AFQMC/Numerics/tensor_operations.hpp b/src/AFQMC/Numerics/tensor_operations.hpp
index f4ea13191d..e93757a64b 100644
--- a/src/AFQMC/Numerics/tensor_operations.hpp
+++ b/src/AFQMC/Numerics/tensor_operations.hpp
@@ -29,6 +29,7 @@ namespace ma
 template
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -40,7 +41,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     T* B)
 {
   // OpenMP: Combine Ka,Kj loops into single loop and call parallel for
-  int naj = nocc_max * nmo_max;
+  int napj = nocc_max * npol * nmo_max;
   int na0 = 0;
   for (int Ka = 0; Ka < nkpts; Ka++)
   {
@@ -50,16 +51,19 @@ void KaKjw_to_KKwaj(int nwalk,
     {
       int nj = nopk[Kj];
       //auto G_(to_address(GKK[0][Ka][Kj].origin()));
-      auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * nmo_max);
+      auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * npol * nmo_max);
       for (int a = 0; a < na; a++)
       {
-        //auto Gc_( to_address(Gca[na0+a][nj0].origin()) );
-        auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk);
-        int aj = a * nmo_max;
-        for (int j = 0; j < nj; j++, aj++)
+        //auto Gc_( to_address(Gca[na0+a][p][nj0].origin()) );
+        for (int p = 0; p < npol; p++)
         {
-          for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj)
-            G_[waj + aj] = static_cast<T>(*Gc_);
+          // each (a,p) block is padded to nmo_max, so apj must restart at
+          // (a*npol + p)*nmo_max for every p rather than run on across p
+          int apj = (a * npol + p) * nmo_max;
+          auto Gc_(A + ((na0 + a) * npol + p) * nmo_tot * nwalk + nj0 * nwalk);
+          for (int j = 0; j < nj; j++, apj++)
+          {
+            for (int w = 0, wapj = 0; w < nwalk; w++, ++Gc_, wapj += napj)
+              G_[wapj + apj] = static_cast<T>(*Gc_);
+          }
         }
       }
       nj0 += nj;
@@ -71,6 +75,7 @@ void KaKjw_to_KKwaj(int nwalk,
 template
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -94,16 +99,19 @@ void KaKjw_to_QKajw(int nwalk,
       int na0 = nocc0[Ka];
       int nj0 = nmo0[Kj];
       //auto G_(to_address(GKK[Q][K].origin()));
-      auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * nmo_max);
-      for (int a = 0, a0 = 0; a < na; a++, a0 += nmo_max * nwalk)
+      auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * npol * nmo_max);
+      for (int a = 0, a0 = 0; a < na; a++)
       {
-        //auto Gc_( to_address(Gca[na0+a][nj0].origin()) );
-        auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk);
-        for (int j = 0, aj = a0; j < nj; j++, aj += nwalk)
+        for (int p = 0; p < npol; p++, a0 += nmo_max * nwalk)
         {
-          for (int w = 0; w < nwalk; w++, ++Gc_)
+          //auto Gc_(
to_address(Gca[na0+a][p][nj0].origin()) ); + auto Gc_(A + ((na0 + a) * npol + p) * nmo_tot * nwalk + nj0 * nwalk); + for (int j = 0, apj = a0; j < nj; j++, apj += nwalk) { - G_[aj + w] = static_cast(*Gc_); + for (int w = 0; w < nwalk; w++, ++Gc_) + { + G_[apj + w] = static_cast(*Gc_); + } } } } @@ -275,6 +283,7 @@ namespace device template void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -285,13 +294,14 @@ void KaKjw_to_KKwaj(int nwalk, device_pointer A, device_pointer B) { - kernels::KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), + kernels::KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), to_address(nocc0), to_address(A), to_address(B)); } template void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -303,7 +313,7 @@ void KaKjw_to_QKajw(int nwalk, device_pointer A, device_pointer B) { - kernels::KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), + kernels::KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), to_address(nocc0), to_address(QKtok2), to_address(A), to_address(B)); } diff --git a/src/AFQMC/Numerics/tests/test_tensor_operations.cpp b/src/AFQMC/Numerics/tests/test_tensor_operations.cpp index 8b54c41241..b527aee87d 100644 --- a/src/AFQMC/Numerics/tests/test_tensor_operations.cpp +++ b/src/AFQMC/Numerics/tests/test_tensor_operations.cpp @@ -100,7 +100,7 @@ TEST_CASE("KaKjw_to_KKwaj", "[Numerics][tensor_operations]") // KaKjw = numpy.arange(nk*na*nk*nj*nw).reshape((nk,na,nk,nj,nw)) // KKwaj = numpy.transpose(KaKjw, (0,2,1,3,4)) using ma::KaKjw_to_KKwaj; - KaKjw_to_KKwaj(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), + KaKjw_to_KKwaj(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), nel_pk0.origin(), KaKjw.origin(), KKwaj.origin()); // Reference values from python REQUIRE(KKwaj[1][2][3][4][4] == Approx(1473.0)); @@ -127,7 +127,7 @@ TEST_CASE("KaKjw_to_QKwaj", "[Numerics][tensor_operations]") Tensor2D qk_to_k2 = {{0, 1, 2, 3}, {1, 0, 3, 2}, {2, 3, 0, 1}, {3, 2, 1, 0}}; copy_n(buffer.data(), buffer.size(), KaKjw.origin()); using ma::KaKjw_to_QKajw; - KaKjw_to_QKajw(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), + KaKjw_to_QKajw(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), nel_pk0.origin(), qk_to_k2.origin(), KaKjw.origin(), QKajw.origin()); // Just captured from output. 
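  // Note: npol is 1 in these calls, which reduces the new spinor-aware layout
  // to the collinear one the reference values in these tests were generated
  // with. A noncollinear check would pass npol = 2 with inputs carrying the
  // extra spinor dimension after the occupied index, e.g. (sketch only;
  // KaKjw2/QKajw2 with the doubled spinor axis are hypothetical arrays):
  //   KaKjw_to_QKajw(nwalk, nk, 2, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(),
  //                  nmo_pk0.origin(), nel_pk.origin(), nel_pk0.origin(),
  //                  qk_to_k2.origin(), KaKjw2.origin(), QKajw2.origin());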
REQUIRE(QKajw[1][2][3][4][4] == Approx(1904.0));

diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp
index 73ff2e0ba6..bc43967aaa 100644
--- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp
+++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp
@@ -180,7 +180,7 @@ class SlaterDetOperations_serial : public SlaterDetOperations_base
+template <class MatA, class MatB, class MatC>
+inline void apply_expM_noncollinear(const MatA& V, MatB&& S, MatC& T1, MatC& T2,
+                                    int order = 6, char TA = 'N')
+{
+  static_assert(std::decay<MatA>::type::dimensionality == 3, " batched::apply_expM_noncollinear::dimensionality == 3");
+  static_assert(std::decay<MatB>::type::dimensionality == 3, " batched::apply_expM_noncollinear::dimensionality == 3");
+  static_assert(std::decay<MatC>::type::dimensionality == 3, " batched::apply_expM_noncollinear::dimensionality == 3");
+  assert(V.size(0) * 2 == S.size(0));
+  assert(V.size(0) * 2 == T1.size(0));
+  assert(V.size(0) * 2 == T2.size(0));
+  assert(V.size(1) == V.size(2));
+  assert(V.size(2) == S.size(1));
+  assert(S.size(1) == T1.size(1));
+  assert(S.size(2) == T1.size(2));
+  assert(S.size(1) == T2.size(1));
+  assert(S.size(2) == T2.size(2));
+  // for now limit to contiguous arrays
+  assert(S.stride(0) == S.size(1) * S.size(2));
+  assert(T1.stride(0) == T1.size(1) * T1.size(2));
+  assert(T2.stride(0) == T2.size(1) * T2.size(2));
+  assert(S.stride(1) == S.size(2));
+  assert(T1.stride(1) == T1.size(2));
+  assert(T2.stride(1) == T2.size(2));
+  assert(S.stride(2) == 1);
+  assert(T1.stride(2) == 1);
+  assert(T2.stride(2) == 1);
+
+  using ComplexType = typename std::decay<MatB>::type::element;
+  ComplexType zero(0.);
+  ComplexType im(0.0, 1.0);
+  if (TA == 'H' || TA == 'h')
+    im = ComplexType(0.0, -1.0);
+  auto pT1(std::addressof(T1));
+  auto pT2(std::addressof(T2));
+
+  using pointerA = typename std::decay<MatA>::type::element_const_ptr;
+  using pointerC = typename std::decay<MatC>::type::element_ptr;
+
+  int nbatch = S.size(0);
+  int ldv    = V.stride(1);
+  int M      = T2.size(2);
+  int N      = T2.size(1);
+  int K      = T1.size(1);
+
+  std::vector<pointerA> Vi;
+  std::vector<pointerC> T1i;
+  std::vector<pointerC> T2i;
+  Vi.reserve(2 * V.size(0));
+  T1i.reserve(T1.size(0));
+  T2i.reserve(T2.size(0));
+  // both spin blocks of a spinor walker propagate with the same V,
+  // so each V[i] enters the batch list twice
+  for (int i = 0; i < V.size(0); i++)
+  {
+    Vi.emplace_back(ma::pointer_dispatch(V[i].origin()));
+    Vi.emplace_back(ma::pointer_dispatch(V[i].origin()));
+  }
+  for (int i = 0; i < T1.size(0); i++)
+    T1i.emplace_back(ma::pointer_dispatch(T1[i].origin()));
+  for (int i = 0; i < T2.size(0); i++)
+    T2i.emplace_back(ma::pointer_dispatch(T2[i].origin()));
+
+  auto pT1i(std::addressof(T1i));
+  auto pT2i(std::addressof(T2i));
+
+  // getting around issue in multi, fix later
+  //T1 = S;
+  using std::copy_n;
+  copy_n(S.origin(), S.num_elements(), T1.origin());
+  // Taylor expansion of the propagator: accumulates one order of
+  // S * (im*V)^n / n! per iteration, so on exit S <- exp(im*V) applied to S
+  for (int n = 1; n <= order; n++)
+  {
+    ComplexType fact = im * static_cast<ComplexType>(1.0 / static_cast<double>(n));
+    using ma::gemmBatched;
+    // careful with fortran ordering
+    gemmBatched('N', TA, M, N, K, fact, pT1i->data(), (*pT1).stride(1), Vi.data(), ldv,
+                zero, pT2i->data(), (*pT2).stride(1), nbatch);
+    using ma::axpy;
+    axpy(S.num_elements(), ComplexType(1.0), (*pT2).origin(), 1, S.origin(), 1);
+    std::swap(pT1, pT2);
+    std::swap(pT1i, pT2i);
+  }
+}
+
 } // namespace batched

 } // namespace SlaterDeterminantOperations

diff --git a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp
index cdf85807d4..2c04324eb8 100644
--- a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp
+++ b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp
@@ -779,7 +779,7 @@
void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, // Lakn[a][sk][n] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nmo; k++) { - ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bnas); + ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aas_i), bnas); for (int a = 0; a < na; a++) for (int p = 0; p < npol; p++) Lakn[a][p*nmo+k] = bnas({0, nchol}, a*npol+p); From f86d1df71f0c887cb576c69ff70e6c58a7bf3125 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Wed, 16 Sep 2020 15:58:28 -0700 Subject: [PATCH 08/10] small bug fix on single precision gpu build --- .../HamiltonianOperations/KP3IndexFactorization_batched.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp index 5d20c41780..bbdb453dee 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp @@ -571,7 +571,7 @@ class KP3IndexFactorization_batched C3Tensor_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol, nmo_max}); for (int a = 0; a < nelpk[nd][K]; ++a) for (int pol = 0; pol < npol; ++pol) - ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+p].sliced(nk, nk + nopk[K])), + ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+pol].sliced(nk, nk + nopk[K])), haj_K[a][pol].sliced(0, nopk[K]), ComplexType(1.), E({0, nwalk}, 0)); na += nelpk[nd][K]; if (walker_type == COLLINEAR) From 288629efdcbcf6167edb13eb48c95edf1917a9ac Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Tue, 29 Sep 2020 18:25:32 -0700 Subject: [PATCH 09/10] fix small bug with padding of H1 in GPU --- .../KP3IndexFactorization_batched.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp index bbdb453dee..a23b02a3ac 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp @@ -568,22 +568,23 @@ class KP3IndexFactorization_batched for (int K = 0; K < nkpts; ++K) { #if defined(MIXED_PRECISION) - C3Tensor_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol, nmo_max}); - for (int a = 0; a < nelpk[nd][K]; ++a) + int ni(nopk[K]); + CMatrix_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol*nmo_max}); + for (int a = 0; a < nelpk[nd][K]; ++a) for (int pol = 0; pol < npol; ++pol) - ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+pol].sliced(nk, nk + nopk[K])), - haj_K[a][pol].sliced(0, nopk[K]), ComplexType(1.), E({0, nwalk}, 0)); + ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+pol].sliced(nk, nk + ni)), + haj_K[a].sliced(pol*ni, pol*ni+ni), ComplexType(1.), E({0, nwalk}, 0)); na += nelpk[nd][K]; if (walker_type == COLLINEAR) { boost::multi::array_ref haj_Kb(haj_K.origin() + haj_K.num_elements(), {nocc_max, nmo_max}); for (int b = 0; b < nelpk[nd][nkpts + K]; ++b) - ma::product(ComplexType(1.), ma::T(G3Db[nb + b].sliced(nk, nk + nopk[K])), haj_Kb[b].sliced(0, nopk[K]), + ma::product(ComplexType(1.), ma::T(G3Db[nb + b].sliced(nk, nk + ni)), haj_Kb[b].sliced(0, ni), ComplexType(1.), E({0, nwalk}, 0)); nb += nelpk[nd][nkpts + K]; } - nk += nopk[K]; + nk += ni; #else nk = nopk[K]; { From 05fb232c11456e459a8595b8ab7847cb705c7dd3 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Fri, 2 Oct 2020 11:11:08 -0700 
Subject: [PATCH 10/10] small bug fix

---
 .../HamiltonianOperations/tests/test_hamiltonian_operations.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp
index 066d3290d2..8adf8c9f00 100644
--- a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp
+++ b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp
@@ -247,7 +247,7 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world)
   ComplexType Xsum = 0, Xsum2=0;
   for(int i=0; i1e-8) {
     REQUIRE( real(Xsum) == Approx(real(file_data.Xsum)) );
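
The (partially garbled) loop and guard above accumulate Xsum and compare it
against the stored reference only when that reference is meaningfully nonzero;
Catch2's Approx() is a relative check and degenerates when the expected value
is zero. A minimal standalone sketch of the same pattern ('sum' and 'ref' are
hypothetical names, not from the test):

    // Skip relative comparisons against a reference value of zero;
    // Approx() scales its tolerance by the expected value.
    if (std::abs(ref) > 1e-8)
    {
      REQUIRE(real(sum) == Approx(real(ref)));
      REQUIRE(imag(sum) == Approx(imag(ref)));
    }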