diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86534f6294..a70d18c237 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -244,9 +244,6 @@ SET(BUILD_AFQMC_WITH_NCCL 0 CACHE BOOL "Build AFQMC with NCCL library.")
 If (BUILD_AFQMC AND NOT QMC_MPI)
   MESSAGE(FATAL_ERROR "AFQMC requires building with MPI (QMC_MPI=1). Set BUILD_AFQMC=0 or configure MPI.")
 ENDIF()
-If (BUILD_AFQMC AND ENABLE_CUDA AND NOT QMC_COMPLEX)
-  MESSAGE(FATAL_ERROR "AFQMC only works on GPUs if QMC_COMPLEX=1.")
-ENDIF()
 SET(BUILD_FCIQMC 0 CACHE BOOL "Build with FCIQMC")
 #SET(BUILD_QMCTOOLS 1 CACHE BOOL "Build tools for QMCPACK")
 #SET(MPIP_PROFILE 0 CACHE BOOL "Build with mpip for mpi profile")
diff --git a/external_codes/mpi_wrapper/mpi3/shared_window.hpp b/external_codes/mpi_wrapper/mpi3/shared_window.hpp
index fbe6e3c010..60e4af39ef 100644
--- a/external_codes/mpi_wrapper/mpi3/shared_window.hpp
+++ b/external_codes/mpi_wrapper/mpi3/shared_window.hpp
@@ -9,9 +9,9 @@
 #include "../mpi3/dynamic_window.hpp"
 #include "../mpi3/group.hpp"
-#include
-#include
-#include
+//#include
+//#include
+//#include
 #include
@@ -20,9 +20,9 @@ namespace mpi3{
 template struct shared_window : window{
-// shared_communicator& comm_;
-  shared_window(shared_communicator& comm, mpi3::size_t n, int disp_unit = alignof(T)) //: //sizeof(T)) : // here we assume that disp_unit is used for align
-    //window()//, comm_{comm}
+  shared_communicator& comm_;
+  shared_window(shared_communicator& comm, mpi3::size_t n, int disp_unit = alignof(T)) : //sizeof(T)) : // here we assume that disp_unit is used for align
+    window(), comm_{comm}
 	{
 		void* base_ptr = nullptr;
 		auto e = static_cast(
@@ -37,12 +37,12 @@ struct shared_window : window{
 		shared_window(comm, 0, disp_unit)
 	{}
 	shared_window(shared_window const&) = default;
-	shared_window(shared_window&& other) noexcept : window{std::move(other)}//, comm_{other.comm_}
+	shared_window(shared_window&& other) noexcept : window{std::move(other)}, comm_{other.comm_}
 	{}
 	group get_group() const{ group r; MPI3_CALL(MPI_Win_get_group)(this->impl_, &(&r)); return r; }
-//	shared_communicator& get_communicator() const{return comm_;}
+	shared_communicator& get_communicator() const{return comm_;}
 	struct query_t{
 		mpi3::size_t size;
 		int disp_unit;
diff --git a/src/AFQMC/CMakeLists.txt b/src/AFQMC/CMakeLists.txt
index 297c138862..8038740616 100644
--- a/src/AFQMC/CMakeLists.txt
+++ b/src/AFQMC/CMakeLists.txt
@@ -16,6 +16,8 @@ SET (AFQMC_SRCS
   Hamiltonians/HSPotential_Helpers.cpp
   Hamiltonians/FactorizedSparseHamiltonian.cpp
   Hamiltonians/KPFactorizedHamiltonian.cpp
+  Hamiltonians/RealDenseHamiltonian.cpp
+  Hamiltonians/RealDenseHamiltonian_v2.cpp
 # Hamiltonians/KPTHCHamiltonian.cpp
   Hamiltonians/HamiltonianFactory_Helper.cpp
   Hamiltonians/HamiltonianFactory.cpp
@@ -47,6 +49,8 @@ IF(ENABLE_CUDA)
   Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu
   Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cu
   Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cu
+  Numerics/detail/CUDA/Kernels/dot_wabn.cu
+  Numerics/detail/CUDA/Kernels/Tab_to_Kl.cu
   Numerics/detail/CUDA/Kernels/sampleGaussianRNG.cu
   Numerics/detail/CUDA/Kernels/construct_X.cu
   Numerics/detail/CUDA/Kernels/reference_operations.cu
diff --git a/src/AFQMC/Estimators/Observables/n2r.hpp b/src/AFQMC/Estimators/Observables/n2r.hpp
index 54421b6330..4dd65ab19f 100644
--- a/src/AFQMC/Estimators/Observables/n2r.hpp
+++ b/src/AFQMC/Estimators/Observables/n2r.hpp
@@ -82,14 +82,16 @@ class n2r: public AFQMCInfo
     n2r(afqmc::TaskGroup_& tg_, AFQMCInfo& info, xmlNodePtr cur, WALKER_TYPES wlk, bool host_mem,
         aux_Allocator alloc_, aux_Allocator orb_alloc_, int nave_=1, int bsize=1):
-                AFQMCInfo(info),aux_alloc(alloc_),TG(tg_),walker_type(wlk),writer(false),
-                block_size(bsize),nave(nave_),counter(0),use_host_memory(host_mem),
+                AFQMCInfo(info),aux_alloc(alloc_),block_size(bsize),nave(nave_),counter(0),TG(tg_),
+                walker_type(wlk),dm_size(0),writer(false),
+                use_host_memory(host_mem),
                 hdf_walker_output(""),
-                denom(iextensions<1u>{0},shared_allocator{TG.TG_local()}),
-                DMWork({0,0},shared_allocator{TG.TG_local()}),
                 Orbitals({0,0},orb_alloc_),
-                Buff(iextensions<1u>{0},alloc_),
-                DMAverage({0,0},shared_allocator{TG.TG_local()})
+                DMAverage({0,0},shared_allocator{TG.TG_local()}),
+                DMWork({0,0},shared_allocator{TG.TG_local()}),
+                denom(iextensions<1u>{0},shared_allocator{TG.TG_local()}),
+                Buff(iextensions<1u>{0},alloc_),
+                Buff2(iextensions<1u>{0})
     {
       app_log()<<" -- Adding Back Propagated on-top pair density (N2R) estimator. -- \n ";
diff --git a/src/AFQMC/HamiltonianOperations/HamiltonianOperations.hpp b/src/AFQMC/HamiltonianOperations/HamiltonianOperations.hpp
index 027037367b..b034a8c929 100644
--- a/src/AFQMC/HamiltonianOperations/HamiltonianOperations.hpp
+++ b/src/AFQMC/HamiltonianOperations/HamiltonianOperations.hpp
@@ -28,6 +28,10 @@
 #include "AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp"
 #include "AFQMC/HamiltonianOperations/KP3IndexFactorization_batched_ooc.hpp"
 //#include "AFQMC/HamiltonianOperations/KPTHCOps.hpp"
+#else
+#include "AFQMC/HamiltonianOperations/Real3IndexFactorization.hpp"
+#include "AFQMC/HamiltonianOperations/Real3IndexFactorization_batched.hpp"
+#include "AFQMC/HamiltonianOperations/Real3IndexFactorization_batched_v2.hpp"
 #endif

 namespace qmcplusplus
@@ -164,7 +168,10 @@ class HamiltonianOperations:
                 SparseTensor,
                 SparseTensor,
                 SparseTensor,
-                SparseTensor
+                SparseTensor,
+                Real3IndexFactorization,
+                Real3IndexFactorization_batched,
+                Real3IndexFactorization_batched_v2
             >
 #endif
 {
@@ -182,6 +189,9 @@ class HamiltonianOperations:
     explicit HamiltonianOperations(STRR&& other) : variant(std::move(other)) {}
     explicit HamiltonianOperations(STRC&& other) : variant(std::move(other)) {}
     explicit HamiltonianOperations(STCR&& other) : variant(std::move(other)) {}
+    explicit HamiltonianOperations(Real3IndexFactorization&& other) : variant(std::move(other)) {}
+    explicit HamiltonianOperations(Real3IndexFactorization_batched&& other) : variant(std::move(other)) {}
+    explicit HamiltonianOperations(Real3IndexFactorization_batched_v2&& other) : variant(std::move(other)) {}
 #else
     explicit HamiltonianOperations(KP3IndexFactorization&& other) : variant(std::move(other)) {}
     explicit HamiltonianOperations(KP3IndexFactorization_batched&& other) : variant(std::move(other)) {}
@@ -195,6 +205,9 @@ class HamiltonianOperations:
     explicit HamiltonianOperations(STRR const& other) = delete;
     explicit HamiltonianOperations(STRC const& other) = delete;
     explicit HamiltonianOperations(STCR const& other) = delete;
+    explicit HamiltonianOperations(Real3IndexFactorization const& other) = delete;
+    explicit HamiltonianOperations(Real3IndexFactorization_batched const& other) = delete;
+    explicit HamiltonianOperations(Real3IndexFactorization_batched_v2 const& other) = delete;
 #else
     explicit HamiltonianOperations(KP3IndexFactorization const& other) = delete;
     explicit HamiltonianOperations(KP3IndexFactorization_batched const& other) = delete;
diff --git a/src/AFQMC/HamiltonianOperations/Real3IndexFactorization.hpp b/src/AFQMC/HamiltonianOperations/Real3IndexFactorization.hpp
new file mode 100644
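The constructors added to HamiltonianOperations above follow the class's existing pattern: it derives from boost::variant and forwards move-construction for each alternative while deleting copy-construction. A minimal sketch of that pattern, with hypothetical alternatives OpsA and OpsB standing in for the real AFQMC types:

#include <utility>
#include <boost/variant.hpp>

struct OpsA { /* e.g. Real3IndexFactorization */ };
struct OpsB { /* e.g. Real3IndexFactorization_batched */ };

class Ops : public boost::variant<OpsA, OpsB>
{
  using variant = boost::variant<OpsA, OpsB>;
public:
  // move-only construction from a concrete alternative, mirroring the
  // explicit move constructors and deleted copy constructors above
  explicit Ops(OpsA&& other) : variant(std::move(other)) {}
  explicit Ops(OpsB&& other) : variant(std::move(other)) {}
  explicit Ops(OpsA const&) = delete;
  explicit Ops(OpsB const&) = delete;
};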
index 0000000000..a9c9c820d3 --- /dev/null +++ b/src/AFQMC/HamiltonianOperations/Real3IndexFactorization.hpp @@ -0,0 +1,566 @@ +////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source +// License. See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// +// File developed by: +// Miguel A. Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +// +// File created by: +// Miguel A. Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +//////////////////////////////////////////////////////////////////////////////// + +#ifndef QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_REAL3INDEXFACTORIZATION_HPP +#define QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_REAL3INDEXFACTORIZATION_HPP + +#include +#include +#include + +#include "Configuration.h" +#include "AFQMC/config.h" +#include "mpi3/shared_communicator.hpp" +#include "multi/array.hpp" +#include "multi/array_ref.hpp" +#include "AFQMC/Numerics/ma_operations.hpp" + +#include "AFQMC/Utilities/type_conversion.hpp" +#include "AFQMC/Utilities/taskgroup.h" + +namespace qmcplusplus +{ + +namespace afqmc +{ + +// Custom implementation for real build +class Real3IndexFactorization +{ + + using IVector = boost::multi::array; + using CVector = boost::multi::array; + using SpVector = boost::multi::array; + + using CMatrix = boost::multi::array; + using CMatrix_cref = boost::multi::array_cref; + using CMatrix_ref = boost::multi::array_ref; + using CVector_ref = boost::multi::array_ref; + + using RMatrix = boost::multi::array; + using RMatrix_cref = boost::multi::array_cref; + using RMatrix_ref = boost::multi::array_ref; + using RVector_ref = boost::multi::array_ref; + + using SpCMatrix = boost::multi::array; + using SpCMatrix_cref = boost::multi::array_cref; + using SpCVector_ref = boost::multi::array_ref; + using SpCMatrix_ref = boost::multi::array_ref; + + using SpRMatrix = boost::multi::array; + using SpRMatrix_cref = boost::multi::array_cref; + using SpRVector_ref = boost::multi::array_ref; + using SpRMatrix_ref = boost::multi::array_ref; + + using C3Tensor = boost::multi::array; + using SpC3Tensor = boost::multi::array; + using SpC3Tensor_ref = boost::multi::array_ref; + using SpC4Tensor_ref = boost::multi::array_ref; + + using shmCVector = boost::multi::array>; + using shmRMatrix = boost::multi::array>; + using shmCMatrix = boost::multi::array>; + using shmC3Tensor = boost::multi::array>; + + using shmSpRVector = boost::multi::array>; + using shmSpRMatrix = boost::multi::array>; + using shmSpCMatrix = boost::multi::array>; + using shmSpC3Tensor = boost::multi::array>; + + using this_t = Real3IndexFactorization; + + public: + + Real3IndexFactorization(afqmc::TaskGroup_& tg_, + WALKER_TYPES type, + shmRMatrix&& hij_, + shmCMatrix&& haj_, + shmSpRMatrix&& vik, + shmSpCMatrix&& vak, + std::vector&& vank, + shmCMatrix&& vn0_, + ValueType e0_, + int cv0, + int gncv): + TG(tg_), + walker_type(type), + global_origin(cv0), + global_nCV(gncv), + local_nCV(0), + E0(e0_), + hij(std::move(hij_)), + haj(std::move(haj_)), + Likn(std::move(vik)), + Lank(std::move(vank)), + Lakn(std::move(vak)), + vn0(std::move(vn0_)), + SM_TMats({1,1},shared_allocator{TG.TG_local()}) + { + local_nCV=Likn.size(1); + TG.Node().barrier(); + } + + ~Real3IndexFactorization() {} + + Real3IndexFactorization(const Real3IndexFactorization& other) = delete; + Real3IndexFactorization& operator=(const 
Real3IndexFactorization& other) = delete; + Real3IndexFactorization(Real3IndexFactorization&& other) = default; + Real3IndexFactorization& operator=(Real3IndexFactorization&& other) = default; + + CMatrix getOneBodyPropagatorMatrix(TaskGroup_& TG, boost::multi::array const& vMF) + { + int NMO = hij.size(0); + // in non-collinear case with SO, keep SO matrix here and add it + // for now, stay collinear + CMatrix H1({NMO,NMO}); + + // add sum_n vMF*Spvn, vMF has local contribution only! + boost::multi::array_ref H1D(H1.origin(),{NMO*NMO}); + std::fill_n(H1D.origin(),H1D.num_elements(),ComplexType(0)); + vHS(vMF, H1D); + TG.TG().all_reduce_in_place_n(H1D.origin(),H1D.num_elements(),std::plus<>()); + + // add hij + vn0 and symmetrize + using ma::conj; + + for(int i=0; i 1e-6 ) { + app_error()<<" WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; + app_error()< + void energy(Mat&& E, MatB const& G, int k, bool addH1=true, bool addEJ=true, bool addEXX=true) { + MatB* Kr(nullptr); + MatB* Kl(nullptr); + energy(E,G,k,Kl,Kr,addH1,addEJ,addEXX); + } + + // KEleft and KEright must be in shared memory for this to work correctly + template + void energy(Mat&& E, MatB const& Gc, int nd, MatC* KEleft, MatD* KEright, bool addH1=true, bool addEJ=true, bool addEXX=true) { + assert(E.size(1)>=3); + assert(nd >= 0); + assert(nd < haj.size()); + if(walker_type==COLLINEAR) + assert(2*nd+1 < Lank.size()); + else + assert(nd < Lank.size()); + + int nwalk = Gc.size(0); + int nspin = (walker_type==COLLINEAR?2:1); + int NMO = hij.size(0); + int nel[2]; + nel[0] = Lank[nspin*nd].size(0); + nel[1] = ((nspin==2)?Lank[nspin*nd+1].size(0):0); + assert(Lank[nspin*nd].size(1) == local_nCV); + assert(Lank[nspin*nd].size(2) == NMO); + if(nspin==2) { + assert(Lank[nspin*nd+1].size(1) == local_nCV); + assert(Lank[nspin*nd+1].size(2) == NMO); + } + assert(Gc.num_elements() == nwalk*(nel[0]+nel[1])*NMO); + + int getKr = KEright!=nullptr; + int getKl = KEleft!=nullptr; + if(E.size(0) != nwalk || E.size(1) < 3) + APP_ABORT(" Error in AFQMC/HamiltonianOperations/Real3IndexFactorization::energy(...). 
Incorrect matrix dimensions \n"); + + // T[nwalk][nup][nup][local_nCV] + D[nwalk][nwalk][local_nCV] + size_t mem_needs(0); + size_t cnt(0); + if(addEJ) { +#if MIXED_PRECISION + mem_needs += nwalk*local_nCV; +#else + if(not getKl) mem_needs += nwalk*local_nCV; +#endif + } + if(addEXX) { + mem_needs += nwalk*nel[0]*nel[0]*local_nCV; +#if MIXED_PRECISION + mem_needs += nwalk*nel[0]*NMO; +#else + if(nspin == 2) mem_needs += nwalk*nel[0]*NMO; +#endif + } + set_shm_buffer(mem_needs); + + // messy + SPComplexType *Klptr(nullptr); + long Knr=0, Knc=0; + if(addEJ) { + Knr=nwalk; + Knc=local_nCV; + if(getKr) { + assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV); + assert(KEright->stride(0) == KEright->size(1)); + } +#if MIXED_PRECISION + if(getKl) { + assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); + assert(KEleft->stride(0) == KEleft->size(1)); + } +#else + if(getKl) { + assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); + assert(KEleft->stride(0) == KEleft->size(1)); + Klptr = to_address(KEleft->origin()); + } else +#endif + { + Klptr = to_address(SM_TMats.origin())+cnt; + cnt += Knr*Knc; + } + if(TG.TG_local().root()) std::fill_n(Klptr,Knr*Knc,SPComplexType(0.0)); + } else if(getKr or getKl) { + APP_ABORT(" Error: Kr and/or Kl can only be calculated with addEJ=true.\n"); + } + SpCMatrix_ref Kl(Klptr,{long(Knr),long(Knc)}); + + for(int n=0; n haj_ref(to_address(haj[nd].origin()), iextensions<1u>{haj[nd].num_elements()}); + ma::product(ComplexType(1.),Gc,haj_ref,ComplexType(1.),E(E.extension(0),0)); + for(int i=0; i(ma::dot(T4Dwban[n][a][b],T4Dwban[n][b][a])); + } + E[n][1] -= 0.5*scl*E_; + } + + if(addEJ) { + for(int n=0; n(scl*scl*ma::dot(Kl[n],Kl[n])); + } +#if MIXED_PRECISION + if(getKl) { + long i0, iN; + std::tie(i0,iN) = FairDivideBoundary(long(TG.TG_local().rank()),long(KEleft->num_elements()), + long(TG.TG_local().size())); + copy_n_cast(Klptr+i0,iN-i0,to_address(KEleft->origin())+i0); + } +#endif + if(getKr) { + long i0, iN; + std::tie(i0,iN) = FairDivideBoundary(long(TG.TG_local().rank()),long(KEright->num_elements()), + long(TG.TG_local().size())); + copy_n_cast(Klptr+i0,iN-i0,to_address(KEright->origin())+i0); + } + TG.TG_local().barrier(); + } + } + + template + void fast_energy(Args&&... args) + { + APP_ABORT(" Error: fast_energy not implemented in Real3IndexFactorization. \n"); + } + + template::type::dimensionality==1)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==1)>, + typename = void + > + void vHS(MatA& X, MatB&& v, double a=1., double c=0.) { + using BType = typename std::decay::type::element ; + using AType = typename std::decay::type::element ; + boost::multi::array_ref v_(to_address(v.origin()), + {v.size(0),1}); + boost::multi::array_ref X_(to_address(X.origin()), + {X.size(0),1}); + return vHS(X_,v_,a,c); + } + + template::type::dimensionality==2)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==2)> + > + void vHS(MatA& X, MatB&& v, double a=1., double c=0.) 
{ + assert( Likn.size(1) == X.size(0) ); + assert( Likn.size(0) == v.size(0) ); + assert( X.size(1) == v.size(1) ); + long ik0, ikN; + std::tie(ik0,ikN) = FairDivideBoundary(long(TG.TG_local().rank()),long(Likn.size(0)),long(TG.TG_local().size())); +#if MIXED_PRECISION + size_t mem_needs = X.num_elements()+v.num_elements(); + set_shm_buffer(mem_needs); + boost::multi::array_ref vsp(to_address(SM_TMats.origin()), v.extensions()); + boost::multi::array_ref Xsp(vsp.origin()+vsp.num_elements(), X.extensions()); + long i0, iN; + std::tie(i0,iN) = FairDivideBoundary(long(TG.TG_local().rank()),long(X.num_elements()),long(TG.TG_local().size())); + copy_n_cast(to_address(X.origin())+i0,iN-i0,to_address(Xsp.origin())+i0); + TG.TG_local().barrier(); + ma::product(SPValueType(a),Likn.sliced(ik0,ikN),Xsp, + SPValueType(c),vsp.sliced(ik0,ikN)); + copy_n_cast(to_address(vsp[ik0].origin()),vsp.size(1)*(ikN-ik0), + to_address(v[ik0].origin())); +#else + ma::product(SPValueType(a),Likn.sliced(ik0,ikN),X, + SPValueType(c),v.sliced(ik0,ikN)); +#endif + TG.TG_local().barrier(); + } + + template::type::dimensionality==1)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==1)>, + typename = void + > + void vbias(const MatA& G, MatB&& v, double a=1., double c=0., int k=0) { + using BType = typename std::decay::type::element ; + using AType = typename std::decay::type::element ; + boost::multi::array_ref v_(to_address(v.origin()), + {v.size(0),1}); + if(haj.size(0) == 1) { + + boost::multi::array_cref G_(to_address(G.origin()), + {1,G.size(0)}); + return vbias(G_,v_,a,c,k); + + } else { + + boost::multi::array_cref G_(to_address(G.origin()), + {G.size(0),1}); + return vbias(G_,v_,a,c,k); + } + } + + // v(n,w) = sum_ak L(ak,n) G(w,ak) + template::type::dimensionality==2)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==2)> + > + void vbias(const MatA& G, MatB&& v, double a=1., double c=0., int k=0) { + if(haj.size(0) == 1) { + assert( Lakn.size(0) == G.size(1) ); + assert( Lakn.size(1) == v.size(0) ); + assert( G.size(0) == v.size(1) ); + long ic0, icN; + std::tie(ic0,icN) = FairDivideBoundary(long(TG.TG_local().rank()),long(Lakn.size(1)),long(TG.TG_local().size())); + +#if MIXED_PRECISION + size_t mem_needs = G.num_elements()+v.num_elements(); + set_shm_buffer(mem_needs); + boost::multi::array_ref vsp(to_address(SM_TMats.origin()), v.extensions()); + boost::multi::array_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + long i0, iN; + std::tie(i0,iN) = FairDivideBoundary(long(TG.TG_local().rank()),long(G.num_elements()),long(TG.TG_local().size())); + copy_n_cast(to_address(G.origin())+i0,iN-i0,to_address(Gsp.origin())+i0); + TG.TG_local().barrier(); + if(walker_type==CLOSED) a*=2.0; + ma::product(SPValueType(a),ma::T(Lakn(Lakn.extension(0),{ic0,icN})),ma::T(Gsp), + SPValueType(c),vsp.sliced(ic0,icN)); + copy_n_cast(to_address(vsp[ic0].origin()),vsp.size(1)*(icN-ic0), + to_address(v[ic0].origin())); + TG.TG_local().barrier(); +#else + if(walker_type==CLOSED) a*=2.0; + ma::product(SPValueType(a),ma::T(Lakn(Lakn.extension(0),{ic0,icN})),ma::T(G), + SPValueType(c),v.sliced(ic0,icN)); +#endif + } else { + // multideterminant is not half-rotated, so use Likn + assert( Likn.size(0) == G.size(0) ); + assert( Likn.size(1) == v.size(0) ); + assert( G.size(1) == v.size(1) ); + long ic0, icN; + std::tie(ic0,icN) = FairDivideBoundary(long(TG.TG_local().rank()),long(Likn.size(1)),long(TG.TG_local().size())); + +#if MIXED_PRECISION + size_t mem_needs = 
G.num_elements()+v.num_elements(); + set_shm_buffer(mem_needs); + boost::multi::array_ref vsp(to_address(SM_TMats.origin()), v.extensions()); + boost::multi::array_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + long i0, iN; + std::tie(i0,iN) = FairDivideBoundary(long(TG.TG_local().rank()),long(G.num_elements()),long(TG.TG_local().size())); + copy_n_cast(to_address(G.origin())+i0,iN-i0,to_address(Gsp.origin())+i0); + TG.TG_local().barrier(); + if(walker_type==CLOSED) a*=2.0; + ma::product(SPValueType(a),ma::T(Likn(Likn.extension(0),{ic0,icN})),Gsp, + SPValueType(c),vsp.sliced(ic0,icN)); + copy_n_cast(to_address(vsp[ic0].origin()),vsp.size(1)*(icN-ic0), + to_address(v[ic0].origin())); + TG.TG_local().barrier(); +#else + if(walker_type==CLOSED) a*=2.0; + ma::product(SPValueType(a),ma::T(Likn(Likn.extension(0),{ic0,icN})),G, + SPValueType(c),v.sliced(ic0,icN)); +#endif + } + TG.TG_local().barrier(); + } + + bool distribution_over_cholesky_vectors() const{ return true; } + int number_of_ke_vectors() const{ return local_nCV; } + int local_number_of_cholesky_vectors() const{ return local_nCV; } + int global_number_of_cholesky_vectors() const{ return global_nCV; } + int global_origin_cholesky_vector() const{ return global_origin; } + + // transpose=true means G[nwalk][ik], false means G[ik][nwalk] + bool transposed_G_for_vbias() const{ return (haj.size(0) == 1); } + bool transposed_G_for_E() const{return true;} + // transpose=true means vHS[nwalk][ik], false means vHS[ik][nwalk] + bool transposed_vHS() const{return false;} + + bool fast_ph_energy() const { return false; } + + boost::multi::array getHSPotentials() + { + return boost::multi::array{}; + } + + private: + + afqmc::TaskGroup_& TG; + + WALKER_TYPES walker_type; + + int global_origin; + int global_nCV; + int local_nCV; + + ValueType E0; + + // bare one body hamiltonian + shmRMatrix hij; + + // (potentially half rotated) one body hamiltonian + shmCMatrix haj; + + //Cholesky Tensor Lik[i][k][n] + shmSpRMatrix Likn; + + // permuted half-tranformed Cholesky tensor + // Lank[ 2*idet + ispin ] + std::vector Lank; + + // half-tranformed Cholesky tensor + // only used in single determinant case, haj.size(0)==1. + shmSpCMatrix Lakn; + + // one-body piece of Hamiltonian factorization + shmCMatrix vn0; + + // shared buffer space + // using matrix since there are issues with vectors + shmSpCMatrix SM_TMats; + + myTimer Timer; + + void set_shm_buffer(size_t N) { + if(SM_TMats.num_elements() < N) + SM_TMats.reextent({N,1}); + } + +}; + +} + +} + +#endif diff --git a/src/AFQMC/HamiltonianOperations/Real3IndexFactorization_batched.hpp b/src/AFQMC/HamiltonianOperations/Real3IndexFactorization_batched.hpp new file mode 100644 index 0000000000..09838c3956 --- /dev/null +++ b/src/AFQMC/HamiltonianOperations/Real3IndexFactorization_batched.hpp @@ -0,0 +1,557 @@ +////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source +// License. See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// +// File developed by: +// Miguel A. Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +// +// File created by: +// Miguel A. 
Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +//////////////////////////////////////////////////////////////////////////////// + +#ifndef QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_REAL3INDEXFACTORIZATION_BATCHED_HPP +#define QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_REAL3INDEXFACTORIZATION_BATCHED_HPP + +#include +#include +#include + +#include "Configuration.h" +#include "AFQMC/config.h" +#include "multi/array.hpp" +#include "multi/array_ref.hpp" +#include "AFQMC/Numerics/ma_operations.hpp" + +#include "AFQMC/Utilities/type_conversion.hpp" +#include "AFQMC/Utilities/taskgroup.h" +#include "AFQMC/Utilities/Utils.hpp" +#include "AFQMC/Numerics/helpers/batched_operations.hpp" +#include "AFQMC/Numerics/tensor_operations.hpp" + +namespace qmcplusplus +{ + +namespace afqmc +{ + +// Custom implementation for real build +class Real3IndexFactorization_batched +{ + + // allocators + using Allocator = device_allocator; + using SpAllocator = device_allocator; + using Allocator_shared = node_allocator; + using SpAllocator_shared = node_allocator; + using SpRAllocator_shared = node_allocator; + + // type defs + using pointer = typename Allocator::pointer; + using sp_pointer = typename SpAllocator::pointer; + using pointer_shared = typename Allocator_shared::pointer; + using sp_pointer_shared = typename SpAllocator_shared::pointer; + using sp_rpointer_shared = typename SpRAllocator_shared::pointer; + + using CVector = ComplexVector; + using SpVector = SPComplexVector; + using SpCMatrix = SPComplexMatrix; + using CVector_ref = ComplexVector_ref; + using SpCVector_ref = SPComplexVector_ref; + using CMatrix_ref = ComplexMatrix_ref; + using SpCMatrix_ref = SPComplexMatrix_ref; + using SpC4Tensor_ref = boost::multi::array_ref; + + using shmCMatrix = ComplexMatrix; + using shmSpC3Tensor = SPComplex3Tensor; + using shmSpCMatrix = SPComplexMatrix; + using shmSpRMatrix = boost::multi::array; + + using mpi3RMatrix = boost::multi::array>; + using mpi3CMatrix = boost::multi::array>; + + public: + + template + Real3IndexFactorization_batched(WALKER_TYPES type, + mpi3RMatrix&& hij_, + shmCMatrix_&& haj_, + shmSpRMatrix_&& vik, + shmSpCMatrix_&& vak, + std::vector&& vank, + mpi3CMatrix&& vn0_, + ValueType e0_, + Allocator const& alloc_, + int cv0, + int gncv): + allocator_(alloc_), + sp_allocator_(alloc_), + walker_type(type), + global_origin(cv0), + global_nCV(gncv), + local_nCV(0), + E0(e0_), + hij(std::move(hij_)), + haj(std::move(haj_)), + Likn(std::move(vik)), + Lank(std::move(move_vector(std::move(vank)))), + Lakn(std::move(vak)), + vn0(std::move(vn0_)), + TBuff(iextensions<1u>{1},sp_allocator_) + { + local_nCV=Likn.size(1); + size_t lank(0); + for(auto& v: Lank) lank += v.num_elements(); + app_log()<<"****************************************************************** \n" + <<" Static memory usage by Real3IndexFactorization_batched (node 0 in MB) \n" + <<" Likn: " < getOneBodyPropagatorMatrix(TaskGroup_& TG, boost::multi::array const& vMF) + { + int NMO = hij.size(0); + // in non-collinear case with SO, keep SO matrix here and add it + // for now, stay collinear + + CVector vMF_(vMF); + CVector P1D(iextensions<1u>{NMO*NMO}); + fill_n(P1D.origin(),P1D.num_elements(),ComplexType(0)); + vHS(vMF_, P1D); + if(TG.TG().size() > 1) + TG.TG().all_reduce_in_place_n(to_address(P1D.origin()),P1D.num_elements(),std::plus<>()); + + boost::multi::array P1({NMO,NMO}); + copy_n(P1D.origin(),NMO*NMO,P1.origin()); + + using ma::conj; + + for(int i=0; i 1e-6 ) { + app_error()<<" WARNING in 
getOneBodyPropagatorMatrix. P1 is not hermitian. \n"; + app_error()< + void energy(Mat&& E, MatB const& G, int k, bool addH1=true, bool addEJ=true, bool addEXX=true) { + MatB* Kr(nullptr); + MatB* Kl(nullptr); + energy(E,G,k,Kl,Kr,addH1,addEJ,addEXX); + } + + // KEleft and KEright must be in shared memory for this to work correctly + template + void energy(Mat&& E, MatB const& Gc, int nd, MatC* KEleft, MatD* KEright, bool addH1=true, bool addEJ=true, bool addEXX=true) { + assert(E.size(1)>=3); + assert(nd >= 0); + assert(nd < haj.size()); + if(walker_type==COLLINEAR) + assert(2*nd+1 < Lank.size()); + else + assert(nd < Lank.size()); + + int nwalk = Gc.size(0); + int nspin = (walker_type==COLLINEAR?2:1); + int NMO = hij.size(0); + int nel[2]; + nel[0] = Lank[nspin*nd].size(0); + nel[1] = ((nspin==2)?Lank[nspin*nd+1].size(0):0); + assert(Lank[nspin*nd].size(1) == local_nCV); + assert(Lank[nspin*nd].size(2) == NMO); + if(nspin==2) { + assert(Lank[nspin*nd+1].size(1) == local_nCV); + assert(Lank[nspin*nd+1].size(2) == NMO); + } + assert(Gc.num_elements() == nwalk*(nel[0]+nel[1])*NMO); + + int getKr = KEright!=nullptr; + int getKl = KEleft!=nullptr; + if(E.size(0) != nwalk || E.size(1) < 3) + APP_ABORT(" Error in AFQMC/HamiltonianOperations/Real3IndexFactorization_batched::energy(...). Incorrect matrix dimensions \n"); + + // T[nwalk][nup][nup][local_nCV] + D[nwalk][nwalk][local_nCV] + size_t mem_needs(0); + size_t cnt(0); + if(addEJ) { +#if MIXED_PRECISION + mem_needs += nwalk*local_nCV; +#else + if(not getKl) mem_needs += nwalk*local_nCV; +#endif + } + if(addEXX) { + mem_needs += nwalk*nel[0]*nel[0]*local_nCV; +#if MIXED_PRECISION + mem_needs += nwalk*nel[0]*NMO; +#else + if(nspin == 2) mem_needs += nwalk*nel[0]*NMO; +#endif + } + set_buffer(mem_needs); + + // messy + sp_pointer Klptr(nullptr); + long Knr=0, Knc=0; + if(addEJ) { + Knr=nwalk; + Knc=local_nCV; + if(getKr) { + assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV); + assert(KEright->stride(0) == KEright->size(1)); + } +#if MIXED_PRECISION + if(getKl) { + assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); + assert(KEleft->stride(0) == KEleft->size(1)); + } +#else + if(getKl) { + assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); + assert(KEleft->stride(0) == KEleft->size(1)); + Klptr = make_device_ptr(KEleft->origin()); + } else +#endif + { + Klptr = TBuff.origin()+cnt; + cnt += Knr*Knc; + } + fill_n(Klptr,Knr*Knc,SPComplexType(0.0)); + } else if(getKr or getKl) { + APP_ABORT(" Error: Kr and/or Kl can only be calculated with addEJ=true.\n"); + } + SpCMatrix_ref Kl(Klptr,{long(Knr),long(Knc)}); + + for(int n=0; n{haj[nd].num_elements()}); + ma::product(ComplexType(1.),Gc,haj_ref,ComplexType(1.),E(E.extension(0),0)); + for(int i=0; i(ma::dot(T4Dwban[n][a][b],T4Dwban[n][b][a])); +// } +// E[n][1] -= 0.5*scl*E_; +// } + + if(addEJ) { +// for(int n=0; n(scl*scl*ma::dot(Kl[n],Kl[n])); +#if MIXED_PRECISION + if(getKl) + copy_n_cast(Klptr,KEleft->num_elements(),make_device_ptr(KEleft->origin())); +#endif + if(getKr) + copy_n_cast(Klptr,KEright->num_elements(),make_device_ptr(KEright->origin())); + } + } + + template + void fast_energy(Args&&... args) + { + APP_ABORT(" Error: fast_energy not implemented in Real3IndexFactorization_batched. \n"); + } + + template::type::dimensionality==1)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==1)>, + typename = void + > + void vHS(MatA& X, MatB&& v, double a=1., double c=0.) 
{ + assert( Likn.size(1) == X.size(0) ); + assert( Likn.size(0) == v.size(0) ); +#if MIXED_PRECISION + size_t mem_needs = X.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCVector_ref vsp(TBuff.origin(), v.extensions()); + SpCVector_ref Xsp(vsp.origin()+vsp.num_elements(), X.extensions()); + copy_n_cast(make_device_ptr(X.origin()),X.num_elements(),Xsp.origin()); + ma::product(SPValueType(a),Likn,Xsp,SPValueType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + ma::product(SPValueType(a),Likn,X,SPValueType(c),v); +#endif + } + + template::type::dimensionality==2)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==2)> + > + void vHS(MatA& X, MatB&& v, double a=1., double c=0.) { + assert( Likn.size(1) == X.size(0) ); + assert( Likn.size(0) == v.size(0) ); + assert( X.size(1) == v.size(1) ); +#if MIXED_PRECISION + size_t mem_needs = X.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCMatrix_ref vsp(TBuff.origin(), v.extensions()); + SpCMatrix_ref Xsp(vsp.origin()+vsp.num_elements(), X.extensions()); + copy_n_cast(make_device_ptr(X.origin()),X.num_elements(),Xsp.origin()); + ma::product(SPValueType(a),Likn,Xsp,SPValueType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + ma::product(SPValueType(a),Likn,X,SPValueType(c),v); +#endif + } + + template::type::dimensionality==1)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==1)>, + typename = void + > + void vbias(const MatA& G, MatB&& v, double a=1., double c=0., int k=0) { + if(haj.size(0) == 1) { + assert( Lakn.size(0) == G.size(0) ); + assert( Lakn.size(1) == v.size(0) ); + +#if MIXED_PRECISION + size_t mem_needs = G.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCVector_ref vsp(TBuff.origin(), v.extensions()); + SpCVector_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + copy_n_cast(make_device_ptr(G.origin()),G.num_elements(),Gsp.origin()); + if(walker_type==CLOSED) a*=2.0; + ma::product(SPComplexType(a),ma::T(Lakn),Gsp,SPComplexType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + if(walker_type==CLOSED) a*=2.0; + ma::product(SPValueType(a),ma::T(Lakn),G,SPValueType(c),v); +#endif + } else { + // multideterminant is not half-rotated, so use Likn + assert( Likn.size(0) == G.size(0) ); + assert( Likn.size(1) == v.size(0) ); + +#if MIXED_PRECISION + size_t mem_needs = G.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCVector_ref vsp(TBuff.origin(), v.extensions()); + SpCVector_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + copy_n_cast(make_device_ptr(G.origin()),G.num_elements(),Gsp.origin()); + if(walker_type==CLOSED) a*=2.0; + ma::product(SPValueType(a),ma::T(Likn),Gsp,SPValueType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + if(walker_type==CLOSED) a*=2.0; + ma::product(SPValueType(a),ma::T(Likn),G,SPValueType(c),v); +#endif + } + } + + // v(n,w) = sum_ak L(ak,n) G(w,ak) + template::type::dimensionality==2)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==2)> + > + void vbias(const MatA& G, MatB&& v, double a=1., double c=0., int k=0) { + if(haj.size(0) == 1) { + assert( Lakn.size(0) == G.size(1) ); + assert( Lakn.size(1) == v.size(0) ); + assert( G.size(0) == v.size(1) ); + +#if MIXED_PRECISION + size_t mem_needs = G.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCMatrix_ref 
vsp(TBuff.origin(), v.extensions()); + SpCMatrix_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + copy_n_cast(make_device_ptr(G.origin()),G.num_elements(),Gsp.origin()); + if(walker_type==CLOSED) a*=2.0; + ma::product(SPComplexType(a),ma::T(Lakn),ma::T(Gsp),SPComplexType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + if(walker_type==CLOSED) a*=2.0; + ma::product(SPComplexType(a),ma::T(Lakn),ma::T(G),SPComplexType(c),v); +#endif + } else { + // multideterminant is not half-rotated, so use Likn + assert( Likn.size(0) == G.size(0) ); + assert( Likn.size(1) == v.size(0) ); + assert( G.size(1) == v.size(1) ); + +#if MIXED_PRECISION + size_t mem_needs = G.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCMatrix_ref vsp(TBuff.origin(), v.extensions()); + SpCMatrix_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + copy_n_cast(make_device_ptr(G.origin()),G.num_elements(),Gsp.origin()); + if(walker_type==CLOSED) a*=2.0; + ma::product(SPValueType(a),ma::T(Likn),Gsp,SPValueType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + if(walker_type==CLOSED) a*=2.0; + ma::product(SPValueType(a),ma::T(Likn),G,SPValueType(c),v); +#endif + } + } + + bool distribution_over_cholesky_vectors() const{ return true; } + int number_of_ke_vectors() const{ return local_nCV; } + int local_number_of_cholesky_vectors() const{ return local_nCV; } + int global_number_of_cholesky_vectors() const{ return global_nCV; } + int global_origin_cholesky_vector() const{ return global_origin; } + + // transpose=true means G[nwalk][ik], false means G[ik][nwalk] + bool transposed_G_for_vbias() const{ return (haj.size(0) == 1); } + bool transposed_G_for_E() const{return true;} + // transpose=true means vHS[nwalk][ik], false means vHS[ik][nwalk] + bool transposed_vHS() const{return false;} + + bool fast_ph_energy() const { return false; } + + boost::multi::array getHSPotentials() + { + return boost::multi::array{}; + } + + private: + + Allocator allocator_; + SpAllocator sp_allocator_; + + WALKER_TYPES walker_type; + + int global_origin; + int global_nCV; + int local_nCV; + + ValueType E0; + + // bare one body hamiltonian + mpi3RMatrix hij; + + // (potentially half rotated) one body hamiltonian + shmCMatrix haj; + + //Cholesky Tensor Lik[i][k][n] + shmSpRMatrix Likn; + + // permuted half-tranformed Cholesky tensor + // Lank[ 2*idet + ispin ] + std::vector Lank; + + // half-tranformed Cholesky tensor + // only used in single determinant case, haj.size(0)==1. + shmSpCMatrix Lakn; + + // one-body piece of Hamiltonian factorization + mpi3CMatrix vn0; + + // shared buffer space + // using matrix since there are issues with vectors + SpVector TBuff; + + myTimer Timer; + + void set_buffer(size_t N) { + if(TBuff.num_elements() < N) { + app_log()<<" Resizing buffer space in Real3IndexFactorization_batched to " <{N})); + } + memory_report(); + using std::fill_n; + fill_n(TBuff.origin(),N,SPComplexType(0.0)); + } + } + +}; + +} + +} + +#endif diff --git a/src/AFQMC/HamiltonianOperations/Real3IndexFactorization_batched_v2.hpp b/src/AFQMC/HamiltonianOperations/Real3IndexFactorization_batched_v2.hpp new file mode 100644 index 0000000000..0cb1ed289e --- /dev/null +++ b/src/AFQMC/HamiltonianOperations/Real3IndexFactorization_batched_v2.hpp @@ -0,0 +1,629 @@ +////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source +// License. 
See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// +// File developed by: +// Miguel A. Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +// +// File created by: +// Miguel A. Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +//////////////////////////////////////////////////////////////////////////////// + +#ifndef QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_REAL3INDEXFACTORIZATION_BATCHED_V2_HPP +#define QMCPLUSPLUS_AFQMC_HAMILTONIANOPERATIONS_REAL3INDEXFACTORIZATION_BATCHED_V2_HPP + +#include +#include +#include + +#include "Configuration.h" +#include "AFQMC/config.h" +#include "multi/array.hpp" +#include "multi/array_ref.hpp" +#include "AFQMC/Numerics/ma_operations.hpp" + +#include "AFQMC/Utilities/type_conversion.hpp" +#include "AFQMC/Utilities/taskgroup.h" +#include "AFQMC/Utilities/Utils.hpp" +#include "AFQMC/Numerics/helpers/batched_operations.hpp" +#include "AFQMC/Numerics/tensor_operations.hpp" + +namespace qmcplusplus +{ + +namespace afqmc +{ + +// Custom implementation for real build +class Real3IndexFactorization_batched_v2 +{ + + // allocators + using Allocator = device_allocator; + using SpAllocator = device_allocator; + using Allocator_shared = node_allocator; + using SpAllocator_shared = node_allocator; + using SpRAllocator_shared = node_allocator; + + // type defs + using pointer = typename Allocator::pointer; + using sp_pointer = typename SpAllocator::pointer; + using pointer_shared = typename Allocator_shared::pointer; + using sp_pointer_shared = typename SpAllocator_shared::pointer; + using sp_rpointer_shared = typename SpRAllocator_shared::pointer; + + using CVector = ComplexVector; + using SpVector = SPComplexVector; + using SpCMatrix = SPComplexMatrix; + using CVector_ref = ComplexVector_ref; + using SpCVector_ref = SPComplexVector_ref; + using CMatrix_ref = ComplexMatrix_ref; + using SpCMatrix_ref = SPComplexMatrix_ref; + using SpC4Tensor_ref = boost::multi::array_ref; + + using shmCMatrix = ComplexMatrix; + using shmSpC3Tensor = SPComplex3Tensor; + using shmSpCMatrix = SPComplexMatrix; + using shmSpRMatrix = boost::multi::array; + + using mpi3RMatrix = boost::multi::array>; + using mpi3CMatrix = boost::multi::array>; + + public: + + template + Real3IndexFactorization_batched_v2(WALKER_TYPES type, + mpi3RMatrix&& hij_, + shmCMatrix_&& haj_, + shmSpRMatrix_&& vik, + std::vector&& vnak, + mpi3CMatrix&& vn0_, + ValueType e0_, + Allocator const& alloc_, + int cv0, + int gncv, + long maxMem = 2000): + allocator_(alloc_), + sp_allocator_(alloc_), + walker_type(type), + max_memory_MB(maxMem), + global_origin(cv0), + global_nCV(gncv), + local_nCV(0), + E0(e0_), + hij(std::move(hij_)), + haj(std::move(haj_)), + Likn(std::move(vik)), + Lnak(std::move(move_vector(std::move(vnak)))), + vn0(std::move(vn0_)), + TBuff(iextensions<1u>{1},sp_allocator_) + { + local_nCV=Likn.size(1); + size_t lnak(0); + for(auto& v: Lnak) lnak += v.num_elements(); + app_log()<<"****************************************************************** \n" + <<" Static memory usage by Real3IndexFactorization_batched_v2 (node 0 in MB) \n" + <<" Likn: " < getOneBodyPropagatorMatrix(TaskGroup_& TG, boost::multi::array const& vMF) + { + int NMO = hij.size(0); + // in non-collinear case with SO, keep SO matrix here and add it + // for now, stay collinear + + CVector vMF_(vMF); + CVector P1D(iextensions<1u>{NMO*NMO}); + fill_n(P1D.origin(),P1D.num_elements(),ComplexType(0)); + vHS(vMF_, P1D); + 
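+    // P1D now holds sum_n L(ik,n)*vMF(n), the mean-field contribution to the one-body
+    // propagator, computed from this task-group member's share of the Cholesky vectors
+    // only; it is summed over the full task group below before hij and vn0 are added
+    // and the resulting P1 is symmetrized and checked for hermiticity.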
if(TG.TG().size() > 1) + TG.TG().all_reduce_in_place_n(to_address(P1D.origin()),P1D.num_elements(),std::plus<>()); + + boost::multi::array P1({NMO,NMO}); + copy_n(P1D.origin(),NMO*NMO,P1.origin()); + + using ma::conj; + + for(int i=0; i 1e-6 ) { + app_error()<<" WARNING in getOneBodyPropagatorMatrix. P1 is not hermitian. \n"; + app_error()< + void energy(Mat&& E, MatB const& G, int k, bool addH1=true, bool addEJ=true, bool addEXX=true) { + MatB* Kr(nullptr); + MatB* Kl(nullptr); + energy(E,G,k,Kl,Kr,addH1,addEJ,addEXX); + } + + // KEleft and KEright must be in shared memory for this to work correctly + template + void energy(Mat&& E, MatB const& Gc, int nd, MatC* KEleft, MatD* KEright, bool addH1=true, bool addEJ=true, bool addEXX=true) { + assert(E.size(1)>=3); + assert(nd >= 0); + assert(nd < haj.size()); + if(walker_type==COLLINEAR) + assert(2*nd+1 < Lnak.size()); + else + assert(nd < Lnak.size()); + + int nwalk = Gc.size(0); + int nspin = (walker_type==COLLINEAR?2:1); + int NMO = hij.size(0); + int nel[2]; + nel[0] = Lnak[nspin*nd].size(1); + nel[1] = ((nspin==2)?Lnak[nspin*nd+1].size(1):0); + assert(Lnak[nspin*nd].size(0) == local_nCV); + assert(Lnak[nspin*nd].size(2) == NMO); + if(nspin==2) { + assert(Lnak[nspin*nd+1].size(0) == local_nCV); + assert(Lnak[nspin*nd+1].size(2) == NMO); + } + assert(Gc.num_elements() == nwalk*(nel[0]+nel[1])*NMO); + + int getKr = KEright!=nullptr; + int getKl = KEleft!=nullptr; + if(E.size(0) != nwalk || E.size(1) < 3) + APP_ABORT(" Error in AFQMC/HamiltonianOperations/Real3IndexFactorization_batched_v2::energy(...). Incorrect matrix dimensions \n"); + + // T[nwalk][nup][nup][local_nCV] + D[nwalk][nwalk][local_nCV] + size_t mem_needs(0); + size_t cnt(0); + if(addEJ) { +#if MIXED_PRECISION + mem_needs += nwalk*local_nCV; +#else + if(not getKl) mem_needs += nwalk*local_nCV; +#endif + } + if(addEXX) { +#if MIXED_PRECISION + mem_needs += nwalk*nel[0]*NMO; +#else + if(nspin == 2) mem_needs += nwalk*nel[0]*NMO; +#endif + } + int max_nCV = 0; + if(addEXX){ + long LBytes = max_memory_MB*1024L*1024L - mem_needs*sizeof(SPComplexType); + int Bytes = int(LBytes / long(nwalk*nel[0]*nel[0]*sizeof(SPComplexType))); + max_nCV = std::min( std::max(1, Bytes), local_nCV); + assert(max_nCV > 1 && max_nCV <= local_nCV); + mem_needs += long(max_nCV*nwalk*nel[0]*nel[0]); + } + set_buffer(mem_needs); + + // messy + sp_pointer Klptr(nullptr); + long Knr=0, Knc=0; + if(addEJ) { + Knr=nwalk; + Knc=local_nCV; + if(getKr) { + assert(KEright->size(0) == nwalk && KEright->size(1) == local_nCV); + assert(KEright->stride(0) == KEright->size(1)); + } +#if MIXED_PRECISION + if(getKl) { + assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); + assert(KEleft->stride(0) == KEleft->size(1)); + } +#else + if(getKl) { + assert(KEleft->size(0) == nwalk && KEleft->size(1) == local_nCV); + assert(KEleft->stride(0) == KEleft->size(1)); + Klptr = make_device_ptr(KEleft->origin()); + } else +#endif + { + Klptr = TBuff.origin()+cnt; + cnt += Knr*Knc; + } + fill_n(Klptr,Knr*Knc,SPComplexType(0.0)); + } else if(getKr or getKl) { + APP_ABORT(" Error: Kr and/or Kl can only be calculated with addEJ=true.\n"); + } + SpCMatrix_ref Kl(Klptr,{long(Knr),long(Knc)}); + + for(int n=0; n{haj[nd].num_elements()}); + ma::product(ComplexType(1.),Gc,haj_ref,ComplexType(1.),E(E.extension(0),0)); + for(int i=0; i(scl*scl*ma::dot(Kl[n],Kl[n])); +#if MIXED_PRECISION + if(getKl) + copy_n_cast(Klptr,KEleft->num_elements(),make_device_ptr(KEleft->origin())); +#endif + if(getKr) + 
copy_n_cast(Klptr,KEright->num_elements(),make_device_ptr(KEright->origin())); + } + } + + template + void fast_energy(Args&&... args) + { + APP_ABORT(" Error: fast_energy not implemented in Real3IndexFactorization_batched_v2. \n"); + } + + template::type::dimensionality==1)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==1)>, + typename = void + > + void vHS(MatA& X, MatB&& v, double a=1., double c=0.) { + assert( Likn.size(1) == X.size(0) ); + assert( Likn.size(0) == v.size(0) ); +#if MIXED_PRECISION + size_t mem_needs = X.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCVector_ref vsp(TBuff.origin(), v.extensions()); + SpCVector_ref Xsp(vsp.origin()+vsp.num_elements(), X.extensions()); + copy_n_cast(make_device_ptr(X.origin()),X.num_elements(),Xsp.origin()); + if( std::abs(c-0.0) > 1e-6 ) + copy_n_cast(make_device_ptr(v.origin()),v.num_elements(),vsp.origin()); + ma::product(SPValueType(a),Likn,Xsp,SPValueType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + ma::product(SPValueType(a),Likn,X,SPValueType(c),v); +#endif + } + + template::type::dimensionality==2)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==2)> + > + void vHS(MatA& X, MatB&& v, double a=1., double c=0.) { + assert( Likn.size(1) == X.size(0) ); + assert( Likn.size(0) == v.size(0) ); + assert( X.size(1) == v.size(1) ); +#if MIXED_PRECISION + size_t mem_needs = X.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCMatrix_ref vsp(TBuff.origin(), v.extensions()); + SpCMatrix_ref Xsp(vsp.origin()+vsp.num_elements(), X.extensions()); + copy_n_cast(make_device_ptr(X.origin()),X.num_elements(),Xsp.origin()); + if( std::abs(c-0.0) > 1e-6 ) + copy_n_cast(make_device_ptr(v.origin()),v.num_elements(),vsp.origin()); + ma::product(SPValueType(a),Likn,Xsp,SPValueType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + ma::product(SPValueType(a),Likn,X,SPValueType(c),v); +#endif + } + + template::type::dimensionality==1)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==1)>, + typename = void + > + void vbias(const MatA& G, MatB&& v, double a=1., double c=0., int k=0) { + if(walker_type==CLOSED) a*=2.0; + if(haj.size(0) == 1) { + if(walker_type==COLLINEAR) { + int NMO, nel[2]; + NMO = Lnak[0].size(2); + nel[0] = Lnak[0].size(1); + nel[1] = Lnak[1].size(1); + double c_[2]; + c_[0] = c; + c_[1] = c; + if( std::abs(c-0.0) < 1e-8 ) c_[1] = 1.0; + size_t mem_needs = (nel[0]*NMO)+v.num_elements(); + set_buffer(mem_needs); + SpCVector_ref vsp(TBuff.origin(), v.extensions()); + if( std::abs(c-0.0) > 1e-6 ) + copy_n_cast(make_device_ptr(v.origin()),v.num_elements(),vsp.origin()); + for(int ispin=0, is0=0; ispin<2; ispin++) { + assert( Lnak[ispin].size(0) == v.size(0) ); + assert( Lnak[ispin].size(1) == G.size(0) ); + SpCMatrix_ref Ln(make_device_ptr(Lnak[ispin].origin()), {local_nCV,nel[ispin]*NMO}); +#if MIXED_PRECISION + SpCVector_ref Gsp(vsp.origin()+vsp.num_elements(), {nel[ispin]*NMO}); + copy_n_cast(make_device_ptr(G.origin())+is0,Gsp.num_elements(),Gsp.origin()); + ma::product(SPComplexType(a),Ln,Gsp,SPComplexType(c_[ispin]),vsp); +#else + ma::product(SPComplexType(a),Ln,G.sliced(is0,is0+nel[ispin]*NMO),SPComplexType(c_[ispin]),v); +#endif + is0 += nel[ispin]*NMO; + } +#if MIXED_PRECISION + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#endif + } else { + assert( G.size(0) == v.size(1) ); + assert( Lnak[0].size(1) == 
G.size(1) ); + assert( Lnak[0].size(0) == v.size(0) ); + SpCMatrix_ref Ln(make_device_ptr(Lnak[0].origin()), {local_nCV,Lnak[0].size(1)*Lnak[0].size(2)}); +#if MIXED_PRECISION + size_t mem_needs = G.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCVector_ref vsp(TBuff.origin(), v.extensions()); + SpCVector_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + copy_n_cast(make_device_ptr(G.origin()),G.num_elements(),Gsp.origin()); + if( std::abs(c-0.0) > 1e-6 ) + copy_n_cast(make_device_ptr(v.origin()),v.num_elements(),vsp.origin()); + ma::product(SPComplexType(a),Ln,Gsp,SPComplexType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + ma::product(SPComplexType(a),Ln,G,SPComplexType(c),v); +#endif + } + } else { + // multideterminant is not half-rotated, so use Likn + assert( Likn.size(0) == G.size(0) ); + assert( Likn.size(1) == v.size(0) ); + +#if MIXED_PRECISION + size_t mem_needs = G.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCVector_ref vsp(TBuff.origin(), v.extensions()); + SpCVector_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + copy_n_cast(make_device_ptr(G.origin()),G.num_elements(),Gsp.origin()); + ma::product(SPValueType(a),ma::T(Likn),Gsp,SPValueType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + ma::product(SPValueType(a),ma::T(Likn),G,SPValueType(c),v); +#endif + } + } + + // v(n,w) = sum_ak L(ak,n) G(w,ak) + template::type::dimensionality==2)>, + typename = typename std::enable_if_t<(std::decay::type::dimensionality==2)> + > + void vbias(const MatA& G, MatB&& v, double a=1., double c=0., int k=0) { + if(walker_type==CLOSED) a*=2.0; + if(haj.size(0) == 1) { + int nwalk = v.size(1); + if(walker_type==COLLINEAR) { + assert( G.size(1) == v.size(1) ); + int NMO, nel[2]; + NMO = Lnak[0].size(2); + nel[0] = Lnak[0].size(1); + nel[1] = Lnak[1].size(1); + double c_[2]; + c_[0] = c; + c_[1] = c; + if( std::abs(c-0.0) < 1e-8 ) c_[1] = 1.0; + size_t mem_needs = (nwalk*nel[0]*NMO)+v.num_elements(); + set_buffer(mem_needs); + SpCMatrix_ref vsp(TBuff.origin(), v.extensions()); + if( std::abs(c-0.0) > 1e-6 ) + copy_n_cast(make_device_ptr(v.origin()),v.num_elements(),vsp.origin()); + for(int ispin=0, is0=0; ispin<2; ispin++) { + assert( Lnak[ispin].size(0) == v.size(0) ); + assert( Lnak[ispin].size(1) == G.size(0) ); + SpCMatrix_ref Ln(make_device_ptr(Lnak[ispin].origin()), {local_nCV,nel[ispin]*NMO}); +#if MIXED_PRECISION + SpCMatrix_ref Gsp(vsp.origin()+vsp.num_elements(), {nel[ispin]*NMO,nwalk}); + copy_n_cast(make_device_ptr(G.origin())+is0*nwalk,Gsp.num_elements(),Gsp.origin()); + ma::product(SPComplexType(a),Ln,Gsp,SPComplexType(c_[ispin]),vsp); +#else + ma::product(SPComplexType(a),Ln,G.sliced(is0,is0+nel[ispin]*NMO),SPComplexType(c_[ispin]),v); +#endif + is0 += nel[ispin]*NMO; + } +#if MIXED_PRECISION + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#endif + } else { + assert( G.size(0) == v.size(1) ); + assert( Lnak[0].size(1) == G.size(1) ); + assert( Lnak[0].size(0) == v.size(0) ); + SpCMatrix_ref Ln(make_device_ptr(Lnak[0].origin()), {local_nCV,Lnak[0].size(1)*Lnak[0].size(2)}); +#if MIXED_PRECISION + size_t mem_needs = G.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCMatrix_ref vsp(TBuff.origin(), v.extensions()); + SpCMatrix_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + copy_n_cast(make_device_ptr(G.origin()),G.num_elements(),Gsp.origin()); + if( std::abs(c-0.0) > 1e-6 ) + 
copy_n_cast(make_device_ptr(v.origin()),v.num_elements(),vsp.origin()); + ma::product(SPComplexType(a),Ln,ma::T(Gsp),SPComplexType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + ma::product(SPComplexType(a),Ln,ma::T(G),SPComplexType(c),v); +#endif + } + } else { + // multideterminant is not half-rotated, so use Likn + assert( Likn.size(0) == G.size(0) ); + assert( Likn.size(1) == v.size(0) ); + assert( G.size(1) == v.size(1) ); + +#if MIXED_PRECISION + size_t mem_needs = G.num_elements()+v.num_elements(); + set_buffer(mem_needs); + SpCMatrix_ref vsp(TBuff.origin(), v.extensions()); + SpCMatrix_ref Gsp(vsp.origin()+vsp.num_elements(), G.extensions()); + copy_n_cast(make_device_ptr(G.origin()),G.num_elements(),Gsp.origin()); + ma::product(SPValueType(a),ma::T(Likn),Gsp,SPValueType(c),vsp); + copy_n_cast(vsp.origin(),vsp.num_elements(),make_device_ptr(v.origin())); +#else + ma::product(SPValueType(a),ma::T(Likn),G,SPValueType(c),v); +#endif + } + } + + bool distribution_over_cholesky_vectors() const{ return true; } + int number_of_ke_vectors() const{ return local_nCV; } + int local_number_of_cholesky_vectors() const{ return local_nCV; } + int global_number_of_cholesky_vectors() const{ return global_nCV; } + int global_origin_cholesky_vector() const{ return global_origin; } + + // transpose=true means G[nwalk][ik], false means G[ik][nwalk] + bool transposed_G_for_vbias() const{ + return ((haj.size(0) == 1) && (walker_type!=COLLINEAR)); + } + bool transposed_G_for_E() const{return true;} + // transpose=true means vHS[nwalk][ik], false means vHS[ik][nwalk] + bool transposed_vHS() const{return false;} + + bool fast_ph_energy() const { return false; } + + boost::multi::array getHSPotentials() + { + return boost::multi::array{}; + } + + private: + + Allocator allocator_; + SpAllocator sp_allocator_; + + WALKER_TYPES walker_type; + + long max_memory_MB; + int global_origin; + int global_nCV; + int local_nCV; + + ValueType E0; + + // bare one body hamiltonian + mpi3RMatrix hij; + + // (potentially half rotated) one body hamiltonian + shmCMatrix haj; + + //Cholesky Tensor Lik[i][k][n] + shmSpRMatrix Likn; + + // permuted half-tranformed Cholesky tensor + // Lnak[ 2*idet + ispin ] + std::vector Lnak; + + // one-body piece of Hamiltonian factorization + mpi3CMatrix vn0; + + // shared buffer space + // using matrix since there are issues with vectors + SpVector TBuff; + + myTimer Timer; + + void set_buffer(size_t N) { + if(TBuff.num_elements() < N) { + app_log()<<" Resizing buffer space in Real3IndexFactorization_batched_v2 to " <{N})); + } + memory_report(); + using std::fill_n; + fill_n(TBuff.origin(),N,SPComplexType(0.0)); + } + } + +}; + +} + +} + +#endif diff --git a/src/AFQMC/Hamiltonians/Hamiltonian.hpp b/src/AFQMC/Hamiltonians/Hamiltonian.hpp index c380bccd54..20eb8e2e98 100644 --- a/src/AFQMC/Hamiltonians/Hamiltonian.hpp +++ b/src/AFQMC/Hamiltonians/Hamiltonian.hpp @@ -26,6 +26,9 @@ #ifdef QMC_COMPLEX #include "AFQMC/Hamiltonians/KPFactorizedHamiltonian.h" //#include "AFQMC/Hamiltonians/KPTHCHamiltonian.h" +#else +#include "AFQMC/Hamiltonians/RealDenseHamiltonian.h" +#include "AFQMC/Hamiltonians/RealDenseHamiltonian_v2.h" #endif #include "AFQMC/HamiltonianOperations/HamiltonianOperations.hpp" @@ -107,7 +110,9 @@ class Hamiltonian: public boost::variant #endif { @@ -124,6 +129,9 @@ class Hamiltonian: public boost::variant 0) { - // nothing to do, H1 is read during construction of HamiltonianOperations object. 
- } else + if(htype == KPFactorized || htype == KPTHC) { +#else + if(htype == RealDenseFactorized) { #endif - if(dump.readEntry(H1,"hcore")) { - foundH1 = true; + // nothing to do, H1 is read during construction of HamiltonianOperations object. } else { + if(dump.readEntry(H1,"hcore")) { + foundH1 = true; + } else { + + H1.reextent({NMO,NMO}); + if(Idata[0] < 1) { + app_error()<<" Error in HamiltonianFactory::fromHDF5(): Dimensions of H1 < 1. \n"; + APP_ABORT(" "); + } - H1.reextent({NMO,NMO}); - if(Idata[0] < 1) { - app_error()<<" Error in HamiltonianFactory::fromHDF5(): Dimensions of H1 < 1. \n"; - APP_ABORT(" "); - } - - std::vector ivec(2*Idata[0]); - if(!dump.readEntry(ivec,"H1_indx")) { - app_error()<<" Error in HamiltonianFactory::fromHDF5(): Problems reading H1_indx. \n"; - APP_ABORT(" "); - } - std::vector vvec(Idata[0]); - if(!dump.readEntry(vvec,"H1")) { - app_error()<<" Error in HamiltonianFactory::fromHDF5(): Problems reading H1. \n"; - APP_ABORT(" "); - } + std::vector ivec(2*Idata[0]); + if(!dump.readEntry(ivec,"H1_indx")) { + app_error()<<" Error in HamiltonianFactory::fromHDF5(): Problems reading H1_indx. \n"; + APP_ABORT(" "); + } + std::vector vvec(Idata[0]); + if(!dump.readEntry(vvec,"H1")) { + app_error()<<" Error in HamiltonianFactory::fromHDF5(): Problems reading H1. \n"; + APP_ABORT(" "); + } - for(int i=0; i=0) - LQKank.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{TG.Node()})); + LQKank.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{TG.Node()})); else - LQKank.emplace_back(shmSpMatrix({1,1},shared_allocator{TG.Node()})); + LQKank.emplace_back(shmSpMatrix({1,1},shared_allocator{TG.Node()})); if(type==COLLINEAR) { for(int Q=0; Q=0) - LQKank.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{TG.Node()})); + LQKank.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{TG.Node()})); else - LQKank.emplace_back(shmSpMatrix({1,1},shared_allocator{TG.Node()})); + LQKank.emplace_back(shmSpMatrix({1,1},shared_allocator{TG.Node()})); } } for(int nd=0, nt=0, nq0=0; nd{TG.Node()})); + LQKbnl.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{TG.Node()})); if(type==COLLINEAR) { for(int Q=0; Q{TG.Node()})); + LQKbnl.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{TG.Node()})); } } for(int nd=0, nt=0, nq0=0; nd{TG.Node()}); - if(TG.Node().root()) std::fill_n(haj.origin(),haj.num_elements(),ComplexType(0.0)); + if(TG.Node().root()) std::fill_n(to_address(haj.origin()),haj.num_elements(),ComplexType(0.0)); int ank_max = nocc_max*nchol_max*nmo_max; for(int nd=0; nd=0) - LQKank.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); + LQKank.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); else - LQKank.emplace_back(shmSpMatrix({1,1},shared_allocator{distNode})); + LQKank.emplace_back(shmSpMatrix({1,1},shared_allocator{distNode})); if(type==COLLINEAR) { for(int Q=0; Q=0) - LQKank.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); + LQKank.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); else - LQKank.emplace_back(shmSpMatrix({1,1},shared_allocator{distNode})); + LQKank.emplace_back(shmSpMatrix({1,1},shared_allocator{distNode})); } } - if(distNode.root()) for( auto& v: LQKank ) fill_n(v.origin(),v.num_elements(),SPComplexType(0.0)); + if(distNode.root()) for( auto& v: LQKank ) std::fill_n(to_address(v.origin()),v.num_elements(),SPComplexType(0.0)); std::vector LQKakn; LQKakn.reserve(ndet*nspins*nkpts); for(int nd=0; nd=0) - 
LQKakn.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); + LQKakn.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); else - LQKakn.emplace_back(shmSpMatrix({1,1},shared_allocator{distNode})); + LQKakn.emplace_back(shmSpMatrix({1,1},shared_allocator{distNode})); } if(type==COLLINEAR) { for(int Q=0; Q=0) - LQKakn.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); + LQKakn.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); else - LQKakn.emplace_back(shmSpMatrix({1,1},shared_allocator{distNode})); + LQKakn.emplace_back(shmSpMatrix({1,1},shared_allocator{distNode})); } } } - if(distNode.root()) for( auto& v: LQKakn ) fill_n(v.origin(),v.num_elements(),SPComplexType(0.0)); - + if(distNode.root()) for( auto& v: LQKakn ) std::fill_n(to_address(v.origin()),v.num_elements(),SPComplexType(0.0)); // NOTE: LQKbnl and LQKbln are indexed by the K index of 'b', L[Q][Kb] std::vector LQKbnl; LQKbnl.reserve(ndet*nspins*number_of_symmetric_Q); // storing 2 components for Q=0, since it is not assumed symmetric for(int nd=0; nd{distNode})); - } + for(int Q=0; Q{distNode})); if(type==COLLINEAR) { - for(int Q=0; Q{distNode})); - } + for(int Q=0; Q{distNode})); } } - if(distNode.root()) for( auto& v: LQKbnl ) fill_n(v.origin(),v.num_elements(),SPComplexType(0.0)); + if(distNode.root()) for( auto& v: LQKbnl ) std::fill_n(to_address(v.origin()),v.num_elements(),SPComplexType(0.0)); std::vector LQKbln; LQKbln.reserve(ndet*nspins*number_of_symmetric_Q); // storing 2 components for Q=0, since it is not assumed symmetric for(int nd=0; nd{distNode})); + LQKbln.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); } if(type==COLLINEAR) { for(int Q=0; Q{distNode})); + LQKbln.emplace_back(shmSpMatrix({nkpts,ank_max},shared_allocator{distNode})); } } } - if(distNode.root()) for( auto& v: LQKbln ) fill_n(v.origin(),v.num_elements(),SPComplexType(0.0)); + if(distNode.root()) for( auto& v: LQKbln ) std::fill_n(to_address(v.origin()),v.num_elements(),SPComplexType(0.0)); int Q0=-1; // if K=(0,0,0) exists, store index here for(int Q=0; Q& PsiT, double cutvn, double cutv2, TaskGroup_& TGprop, TaskGroup_& TGwfn, hdf_archive& dump); diff --git a/src/AFQMC/Hamiltonians/RealDenseHamiltonian.cpp b/src/AFQMC/Hamiltonians/RealDenseHamiltonian.cpp new file mode 100644 index 0000000000..2c264231e8 --- /dev/null +++ b/src/AFQMC/Hamiltonians/RealDenseHamiltonian.cpp @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(USE_MPI) +#include +#endif + +#include "Configuration.h" +#include "type_traits/container_traits_multi.h" +#include "io/hdf_multi.h" +#include "io/hdf_archive.h" + +#include "AFQMC/config.h" +#include "AFQMC/Utilities/Utils.hpp" +#include "AFQMC/Utilities/kp_utilities.hpp" +#include "AFQMC/Hamiltonians/RealDenseHamiltonian.h" +#include "AFQMC/SlaterDeterminantOperations/rotate.hpp" + +namespace qmcplusplus +{ + +namespace afqmc +{ + +#ifndef QMC_COMPLEX + +HamiltonianOperations RealDenseHamiltonian::getHamiltonianOperations(bool pureSD, + bool addCoulomb, WALKER_TYPES type, std::vector& PsiT, + double cutvn, double cutv2, TaskGroup_& TGprop, TaskGroup_& TGwfn, + hdf_archive& hdf_restart) { + + using shmIMatrix = boost::multi::array>; + using shmCVector = boost::multi::array>; + using shmCMatrix = boost::multi::array>; + using shmRMatrix = boost::multi::array>; + using shmCTensor = boost::multi::array>; + using shmSpMatrix = boost::multi::array>; 
+ using shmSpRMatrix = boost::multi::array>; + using shmSp3Tensor = boost::multi::array>; + using IVector = boost::multi::array; + using CMatrix = boost::multi::array; + using SpMatrix = boost::multi::array; + using SpVector_ref = boost::multi::array_ref; + using SpMatrix_ref = boost::multi::array_ref; + using SpRMatrix = boost::multi::array; + using SpRMatrix_ref = boost::multi::array_ref; + using CMatrix_ref = boost::multi::array_ref; + using RMatrix_ref = boost::multi::array_ref; + using Sp3Tensor_ref = boost::multi::array_ref; + + if(type==COLLINEAR) + assert(PsiT.size()%2 == 0); + int nspins = ((type!=COLLINEAR)?1:2); + int ndet = PsiT.size()/nspins; + int nup = PsiT[0].size(0); + int ndown = 0; + if(nspins == 2) ndown = PsiT[1].size(0); + int NEL = nup+ndown; + + hdf_archive dump(TG.Global()); + // right now only Node.root() reads + if( TG.Node().root() ) { + if(!dump.open(fileName,H5F_ACC_RDONLY)) { + app_error()<<" Error opening integral file in THCHamiltonian. \n"; + APP_ABORT(""); + } + if(!dump.push("Hamiltonian",false)) { + app_error()<<" Error in THCHamiltonian::getHamiltonianOperations():" + <<" Group not Hamiltonian found. \n"; + APP_ABORT(""); + } + } + + std::vector Idata(8); + if( TG.Global().root() ) { + if(!dump.readEntry(Idata,"dims")) { + app_error()<<" Error in RealDenseHamiltonian::getHamiltonianOperations():" + <<" Problems reading dims. \n"; + APP_ABORT(""); + } + } + TG.Global().broadcast_n(Idata.begin(),8,0); + + ValueType E0; + if( TG.Global().root() ) { + std::vector E_(2); + if(!dump.readEntry(E_,"Energies")) { + app_error()<<" Error in RealDenseHamiltonian::getHamiltonianOperations():" + <<" Problems reading Energies. \n"; + APP_ABORT(""); + } + E0 = E_[0]+E_[1]; + } + TG.Global().broadcast_n(&E0,1,0); + + int global_ncvecs = Idata[7]; + int nc0, ncN; + int node_number = TGprop.getLocalGroupNumber(); + int nnodes_prt_TG = TGprop.getNGroupsPerTG(); + std::tie(nc0,ncN) = FairDivideBoundary(node_number,global_ncvecs,nnodes_prt_TG); + int local_ncv = ncN-nc0; + + shmRMatrix H1({NMO,NMO},shared_allocator{TG.Node()}); + shmSpRMatrix Likn({NMO*NMO,local_ncv}, shared_allocator{TG.Node()}); + + if( TG.Node().root() ) { + // now read H1, use ref to avoid issues with shared pointers! + RMatrix_ref h1_(to_address(H1.origin()),H1.extensions()); + if(!dump.readEntry(h1_,std::string("hcore"))) { + app_error()<<" Error in RealDenseHamiltonian::getHamiltonianOperations():" + <<" Problems reading /Hamiltonian/hcore. \n"; + APP_ABORT(""); + } + // read L + if(!dump.push("DenseFactorized",false)) { + app_error()<<" Error in RealDenseHamiltonian::getHamiltonianOperations():" + <<" Group DenseFactorized not found. \n"; + APP_ABORT(""); + } + SpRMatrix_ref L(to_address(Likn.origin()),Likn.extensions()); + hyperslab_proxy hslab(L,std::array{NMO*NMO,global_ncvecs}, + std::array{NMO*NMO,local_ncv}, + std::array{0,nc0}); + if(!dump.readEntry(hslab,std::string("L"))) { + app_error()<<" Error in RealDenseHamiltonian::getHamiltonianOperations():" + <<" Problems reading /Hamiltonian/DenseFactorized/L. \n"; + APP_ABORT(""); + } + if(Likn.size(0) != NMO*NMO || Likn.size(1) != local_ncv) { + app_error()<<" Error in RealDenseHamiltonian::getHamiltonianOperations():" + <<" Problems reading /Hamiltonian/DenseFactorized/L. 
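The hunk above distributes the global Cholesky vectors by splitting the columns of L into contiguous blocks, one per group of nodes (FairDivideBoundary supplies the [nc0,ncN) range), and each node then reads only its hyperslab of /Hamiltonian/DenseFactorized/L. A minimal sketch of such a fair column split, using a hypothetical helper rather than the exact FairDivideBoundary convention:

    #include <algorithm>
    #include <cstdio>
    #include <utility>

    // Split n items into nparts nearly equal contiguous blocks; return the
    // [first,last) range owned by 'rank' (the remainder goes to the low ranks).
    std::pair<int,int> fair_divide(int rank, int n, int nparts) {
      int base = n / nparts, rem = n % nparts;
      int first = rank * base + std::min(rank, rem);
      return {first, first + base + (rank < rem ? 1 : 0)};
    }

    int main() {
      // e.g. 1000 Cholesky vectors over 3 node groups: [0,334) [334,667) [667,1000)
      for (int r = 0; r < 3; ++r) {
        auto [c0, cN] = fair_divide(r, 1000, 3);
        std::printf("group %d reads columns [%d,%d)\n", r, c0, cN);
      }
    }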
\n" + <<" Unexpected dimensins: " <{TG.Node()}); + shmCMatrix haj({ndet,NEL*NMO},shared_allocator{TG.Node()}); + std::vector Lank; + Lank.reserve(PsiT.size()); + for(int nd=0; nd{TG.Node()})); + int nrow = NEL; + if(ndet > 1) nrow = 0; // not used if ndet>1 + shmSpMatrix Lakn({nrow*NMO,local_ncv},shared_allocator{TG.Node()}); + TG.Node().barrier(); + + // for simplicity + CMatrix H1C({NMO,NMO}); + copy_n_cast(to_address(H1.origin()),NMO*NMO,H1C.origin()); + + int nt=0; + for(int nd=0; nd0) ma::product(PsiT[2*nd+1],H1C,hbj_r); + } else { + CMatrix_ref haj_r(to_address(haj[nd].origin()),{nup,NMO}); + ma::product(ComplexType(2.0),PsiT[nd],H1C,ComplexType(0.0),haj_r); + } + } + // Generate Lank + // distribute work over equivalent nodes in TGprop.TG() accross TG.Global() + auto Qcomm(TG.Global().split(TGprop.getLocalGroupNumber(),TG.Global().rank())); + auto Qcomm_roots(Qcomm.split(TGprop.Node().rank(),Qcomm.rank())); + { + CMatrix lik({NMO,NMO}); + CMatrix lak({nup,NMO}); + for(int nd=0; nd(Likn[ik][nc]),0.0); + ma::product(PsiT[nspins*nd], lik, lak); + for(int a=0; a(lak[a][k]); + if(type==COLLINEAR) { + ma::product(PsiT[2*nd+1], lik, lak.sliced(0,ndown)); + for(int a=0; a(lak[a][k]); + } + } + } + } + TG.Global().barrier(); + if(TG.Node().root()) { + for(auto& v:Lank) + Qcomm_roots.all_reduce_in_place_n(to_address(v.origin()),v.num_elements(),std::plus<>()); + if(ndet==1) + Qcomm_roots.all_reduce_in_place_n(to_address(Lakn.origin()), + Lakn.num_elements(),std::plus<>()); + std::fill_n(to_address(vn0.origin()),vn0.num_elements(),ComplexType(0.0)); + } + TG.Node().barrier(); + + // calculate vn0(i,l) = -0.5 sum_j sum_n L[i][j][n] L[j][l][n] = -0.5 sum_j sum_n L[i][j][n] L[l][j][n] + { + int i0,iN; + std::tie(i0,iN) = FairDivideBoundary(Qcomm.rank(),NMO,Qcomm.size()); + if(iN>i0) { + SpRMatrix v_({iN-i0,NMO}); + SpRMatrix_ref Lijn(to_address(Likn.origin()),{NMO,NMO*local_ncv}); + ma::product(-0.5,Lijn.sliced(i0,iN),ma::T(Lijn),0.0,v_); + copy_n_cast( v_.origin(), v_.num_elements(), to_address(vn0[i0].origin())); + } + } + TG.Node().barrier(); + + if( TG.Node().root() ) { + Qcomm_roots.all_reduce_in_place_n(to_address(vn0.origin()), + vn0.num_elements(),std::plus<>()); + dump.pop(); + dump.close(); + } + + if(TG.TG_local().size() > 1 || not (batched=="yes" || batched == "true" )) + return HamiltonianOperations(Real3IndexFactorization(TGwfn,type,std::move(H1),std::move(haj), + std::move(Likn),std::move(Lakn),std::move(Lank),std::move(vn0),E0,nc0,global_ncvecs)); + else + return HamiltonianOperations(Real3IndexFactorization_batched(type,std::move(H1),std::move(haj), + std::move(Likn),std::move(Lakn),std::move(Lank),std::move(vn0),E0,device_allocator{}, + nc0,global_ncvecs)); + +} + +#else + +HamiltonianOperations RealDenseHamiltonian::getHamiltonianOperations(bool pureSD, + bool addCoulomb, WALKER_TYPES type, std::vector& PsiT, + double cutvn, double cutv2, TaskGroup_& TGprop, TaskGroup_& TGwfn, + hdf_archive& hdf_restart) { + APP_ABORT(" Error: RealDenseHamiltonian requires complex build (QMC_COMPLEX=1). 
\n"); + return HamiltonianOperations(); +} + + +#endif + +} +} diff --git a/src/AFQMC/Hamiltonians/RealDenseHamiltonian.h b/src/AFQMC/Hamiltonians/RealDenseHamiltonian.h new file mode 100644 index 0000000000..323cce06f1 --- /dev/null +++ b/src/AFQMC/Hamiltonians/RealDenseHamiltonian.h @@ -0,0 +1,98 @@ + +#ifndef QMCPLUSPLUS_AFQMC_REALDENSEHAMILTONIAN_H +#define QMCPLUSPLUS_AFQMC_REALDENSEHAMILTONIAN_H + +#include +#include +#include +#include + +#include "io/hdf_archive.h" + +#include "AFQMC/config.h" +#include "AFQMC/Memory/utilities.hpp" +#include "AFQMC/Utilities/taskgroup.h" +#include "AFQMC/Numerics/ma_operations.hpp" + +#include "AFQMC/Hamiltonians/OneBodyHamiltonian.hpp" + +#include "AFQMC/HamiltonianOperations/HamiltonianOperations.hpp" + +namespace qmcplusplus +{ + +namespace afqmc +{ + +class RealDenseHamiltonian: public OneBodyHamiltonian +{ + + public: + + RealDenseHamiltonian(AFQMCInfo const& info, xmlNodePtr cur, + boost::multi::array&& h, + TaskGroup_& tg_, ValueType nucE=0, ValueType fzcE=0): + OneBodyHamiltonian(info,std::move(h),nucE,fzcE), + TG(tg_),fileName(""),batched("no"),ooc("no") + { + + if(number_of_devices() > 0) batched="yes"; + std::string str("yes"); + ParameterSet m_param; + m_param.add(fileName,"filename","std::string"); + if(TG.TG_local().size() == 1) + m_param.add(batched,"batched","std::string"); + if(TG.TG_local().size() == 1) + m_param.add(ooc,"ooc","std::string"); + m_param.put(cur); + + if(omp_get_num_threads() > 1 && (batched != "yes" && batched != "true")) { + app_log()<<" WARNING!!!: Found OMP_NUM_THREADS > 1 with batched=no.\n" + <<" This will lead to low performance. Set batched=yes. \n"; + } + } + + ~RealDenseHamiltonian() {} + + RealDenseHamiltonian(RealDenseHamiltonian const& other) = delete; + RealDenseHamiltonian(RealDenseHamiltonian && other) = default; + RealDenseHamiltonian& operator=(RealDenseHamiltonian const& other) = delete; + RealDenseHamiltonian& operator=(RealDenseHamiltonian && other) = default; + + ValueType getNuclearCoulombEnergy() const { return OneBodyHamiltonian::NuclearCoulombEnergy; } + + boost::multi::array getH1() const{ return OneBodyHamiltonian::getH1(); } + + HamiltonianOperations getHamiltonianOperations(bool pureSD, bool addCoulomb, WALKER_TYPES type, + std::vector& PsiT, double cutvn, double cutv2, + TaskGroup_& TGprop, TaskGroup_& TGwfn, hdf_archive& dump); + + ValueType H(IndexType I, IndexType J) const + { return OneBodyHamiltonian::H(I,J); } + + // this should never be used outside initialization routines. + ValueType H(IndexType I, IndexType J, IndexType K, IndexType L) const + { + APP_ABORT("Error: Calling H(I,J,K,L) in THCHamiltonian. 
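The Lank construction loop in RealDenseHamiltonian.cpp above builds the half-rotated Cholesky tensor one vector at a time: column nc of the dense factor (stored as an (NMO*NMO) x nchol matrix) is unpacked into an NMO x NMO block lik and contracted with the trial determinant, lak = PsiT * lik. A plain reference loop for a single Cholesky vector, assuming row-major std::vector storage and real arithmetic (a sketch, not the production code path):

    #include <vector>

    // Lak[a][k] = sum_i PsiT[a][i] * L[(i*NMO + k)*nchol + n]   (one vector n)
    void half_rotate_one_vector(int NMO, int nocc, int nchol, int n,
                                const std::vector<double>& PsiT,  // nocc x NMO
                                const std::vector<double>& Likn,  // (NMO*NMO) x nchol
                                std::vector<double>& Lak)         // nocc x NMO
    {
      for (int a = 0; a < nocc; ++a)
        for (int k = 0; k < NMO; ++k) {
          double s = 0.0;
          for (int i = 0; i < NMO; ++i)
            s += PsiT[a * NMO + i] * Likn[(i * NMO + k) * nchol + n];
          Lak[a * NMO + k] = s;
        }
    }

In the actual code PsiT is the (possibly sparse) transposed trial determinant and the result is cast to single precision into Lank ([a][n][k] ordering in v1) or Lnak ([n][a][k] ordering in v2).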
\n"); + return ValueType(0.0); + } + + protected: + + // for hamiltonian distribution + TaskGroup_& TG; + + std::string fileName; + + std::string batched; + + std::string ooc; + + +}; + +} +} + +#endif + diff --git a/src/AFQMC/Hamiltonians/RealDenseHamiltonian_v2.cpp b/src/AFQMC/Hamiltonians/RealDenseHamiltonian_v2.cpp new file mode 100644 index 0000000000..0c6de20764 --- /dev/null +++ b/src/AFQMC/Hamiltonians/RealDenseHamiltonian_v2.cpp @@ -0,0 +1,250 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(USE_MPI) +#include +#endif + +#include "Configuration.h" +#include "type_traits/container_traits_multi.h" +#include "io/hdf_multi.h" +#include "io/hdf_archive.h" + +#include "AFQMC/config.h" +#include "AFQMC/Utilities/Utils.hpp" +#include "AFQMC/Utilities/kp_utilities.hpp" +#include "AFQMC/Hamiltonians/RealDenseHamiltonian_v2.h" +#include "AFQMC/SlaterDeterminantOperations/rotate.hpp" + +namespace qmcplusplus +{ + +namespace afqmc +{ + +#ifndef QMC_COMPLEX + +HamiltonianOperations RealDenseHamiltonian_v2::getHamiltonianOperations(bool pureSD, + bool addCoulomb, WALKER_TYPES type, std::vector& PsiT, + double cutvn, double cutv2, TaskGroup_& TGprop, TaskGroup_& TGwfn, + hdf_archive& hdf_restart) { + + using shmIMatrix = boost::multi::array>; + using shmCVector = boost::multi::array>; + using shmCMatrix = boost::multi::array>; + using shmRMatrix = boost::multi::array>; + using shmCTensor = boost::multi::array>; + using shmSpMatrix = boost::multi::array>; + using shmSpRMatrix = boost::multi::array>; + using shmSp3Tensor = boost::multi::array>; + using IVector = boost::multi::array; + using CMatrix = boost::multi::array; + using SpMatrix = boost::multi::array; + using SpVector_ref = boost::multi::array_ref; + using SpMatrix_ref = boost::multi::array_ref; + using SpRMatrix = boost::multi::array; + using SpRMatrix_ref = boost::multi::array_ref; + using CMatrix_ref = boost::multi::array_ref; + using RMatrix_ref = boost::multi::array_ref; + using Sp3Tensor_ref = boost::multi::array_ref; + + if(type==COLLINEAR) + assert(PsiT.size()%2 == 0); + int nspins = ((type!=COLLINEAR)?1:2); + int ndet = PsiT.size()/nspins; + int nup = PsiT[0].size(0); + int ndown = 0; + if(nspins == 2) ndown = PsiT[1].size(0); + int NEL = nup+ndown; + + hdf_archive dump(TG.Global()); + // right now only Node.root() reads + if( TG.Node().root() ) { + if(!dump.open(fileName,H5F_ACC_RDONLY)) { + app_error()<<" Error opening integral file in THCHamiltonian. \n"; + APP_ABORT(""); + } + if(!dump.push("Hamiltonian",false)) { + app_error()<<" Error in THCHamiltonian::getHamiltonianOperations():" + <<" Group not Hamiltonian found. \n"; + APP_ABORT(""); + } + } + + std::vector Idata(8); + if( TG.Global().root() ) { + if(!dump.readEntry(Idata,"dims")) { + app_error()<<" Error in RealDenseHamiltonian_v2::getHamiltonianOperations():" + <<" Problems reading dims. \n"; + APP_ABORT(""); + } + } + TG.Global().broadcast_n(Idata.begin(),8,0); + + ValueType E0; + if( TG.Global().root() ) { + std::vector E_(2); + if(!dump.readEntry(E_,"Energies")) { + app_error()<<" Error in RealDenseHamiltonian_v2::getHamiltonianOperations():" + <<" Problems reading Energies. 
\n"; + APP_ABORT(""); + } + E0 = E_[0]+E_[1]; + } + TG.Global().broadcast_n(&E0,1,0); + + int global_ncvecs = Idata[7]; + int nc0, ncN; + int node_number = TGprop.getLocalGroupNumber(); + int nnodes_prt_TG = TGprop.getNGroupsPerTG(); + std::tie(nc0,ncN) = FairDivideBoundary(node_number,global_ncvecs,nnodes_prt_TG); + int local_ncv = ncN-nc0; + + shmRMatrix H1({NMO,NMO},shared_allocator{TG.Node()}); + shmSpRMatrix Likn({NMO*NMO,local_ncv}, shared_allocator{TG.Node()}); + + if( TG.Node().root() ) { + // now read H1, use ref to avoid issues with shared pointers! + RMatrix_ref h1_(to_address(H1.origin()),H1.extensions()); + if(!dump.readEntry(h1_,std::string("hcore"))) { + app_error()<<" Error in RealDenseHamiltonian_v2::getHamiltonianOperations():" + <<" Problems reading /Hamiltonian/hcore. \n"; + APP_ABORT(""); + } + // read L + if(!dump.push("DenseFactorized",false)) { + app_error()<<" Error in RealDenseHamiltonian_v2::getHamiltonianOperations():" + <<" Group DenseFactorized not found. \n"; + APP_ABORT(""); + } + SpRMatrix_ref L(to_address(Likn.origin()),Likn.extensions()); + hyperslab_proxy hslab(L,std::array{NMO*NMO,global_ncvecs}, + std::array{NMO*NMO,local_ncv}, + std::array{0,nc0}); + if(!dump.readEntry(hslab,std::string("L"))) { + app_error()<<" Error in RealDenseHamiltonian_v2::getHamiltonianOperations():" + <<" Problems reading /Hamiltonian/DenseFactorized/L. \n"; + APP_ABORT(""); + } + if(Likn.size(0) != NMO*NMO || Likn.size(1) != local_ncv) { + app_error()<<" Error in RealDenseHamiltonian_v2::getHamiltonianOperations():" + <<" Problems reading /Hamiltonian/DenseFactorized/L. \n" + <<" Unexpected dimensins: " <{TG.Node()}); + shmCMatrix haj({ndet,NEL*NMO},shared_allocator{TG.Node()}); + std::vector Lnak; + Lnak.reserve(PsiT.size()); + for(int nd=0; nd{TG.Node()})); + int nrow = NEL; + if(ndet > 1) nrow = 0; // not used if ndet>1 + TG.Node().barrier(); + + // for simplicity + CMatrix H1C({NMO,NMO}); + copy_n_cast(to_address(H1.origin()),NMO*NMO,H1C.origin()); + + int nt=0; + for(int nd=0; nd0) ma::product(PsiT[2*nd+1],H1C,hbj_r); + } else { + CMatrix_ref haj_r(to_address(haj[nd].origin()),{nup,NMO}); + ma::product(ComplexType(2.0),PsiT[nd],H1C,ComplexType(0.0),haj_r); + } + } + // Generate Lnak + // distribute work over equivalent nodes in TGprop.TG() accross TG.Global() + auto Qcomm(TG.Global().split(TGprop.getLocalGroupNumber(),TG.Global().rank())); + auto Qcomm_roots(Qcomm.split(TGprop.Node().rank(),Qcomm.rank())); + { + CMatrix lik({NMO,NMO}); + CMatrix lak({nup,NMO}); + for(int nd=0; nd(Likn[ik][nc]),0.0); + ma::product(PsiT[nspins*nd], lik, lak); + copy_n_cast(lak.origin(),nup*NMO,to_address(Lnak[nspins*nd][nc].origin())); + if(type==COLLINEAR) { + ma::product(PsiT[2*nd+1], lik, lak.sliced(0,ndown)); + copy_n_cast(lak.origin(),ndown*NMO,to_address(Lnak[2*nd+1][nc].origin())); + } + } + } + } + TG.Global().barrier(); + if(TG.Node().root()) { + for(auto& v:Lnak) + Qcomm_roots.all_reduce_in_place_n(to_address(v.origin()),v.num_elements(),std::plus<>()); + std::fill_n(to_address(vn0.origin()),vn0.num_elements(),ComplexType(0.0)); + } + TG.Node().barrier(); + + // calculate vn0(i,l) = -0.5 sum_j sum_n L[i][j][n] L[j][l][n] = -0.5 sum_j sum_n L[i][j][n] L[l][j][n] + { + int i0,iN; + std::tie(i0,iN) = FairDivideBoundary(Qcomm.rank(),NMO,Qcomm.size()); + if(iN>i0) { + SpRMatrix v_({iN-i0,NMO}); + SpRMatrix_ref Lijn(to_address(Likn.origin()),{NMO,NMO*local_ncv}); + ma::product(-0.5,Lijn.sliced(i0,iN),ma::T(Lijn),0.0,v_); + copy_n_cast( v_.origin(), v_.num_elements(), 
to_address(vn0[i0].origin())); + } + } + TG.Node().barrier(); + + if( TG.Node().root() ) { + Qcomm_roots.all_reduce_in_place_n(to_address(vn0.origin()), + vn0.num_elements(),std::plus<>()); + dump.pop(); + dump.close(); + } + +// if(TG.TG_local().size() > 1 || not (batched=="yes" || batched == "true" )) +// return HamiltonianOperations(Real3IndexFactorization(TGwfn,type,std::move(H1),std::move(haj), +// std::move(Likn),std::move(Lakn),std::move(Lank),std::move(vn0),E0,nc0,global_ncvecs)); +// else + return HamiltonianOperations(Real3IndexFactorization_batched_v2(type,std::move(H1),std::move(haj), + std::move(Likn),std::move(Lnak),std::move(vn0),E0,device_allocator{}, + nc0,global_ncvecs,max_memory_MB)); + +} + +#else + +HamiltonianOperations RealDenseHamiltonian_v2::getHamiltonianOperations(bool pureSD, + bool addCoulomb, WALKER_TYPES type, std::vector& PsiT, + double cutvn, double cutv2, TaskGroup_& TGprop, TaskGroup_& TGwfn, + hdf_archive& hdf_restart) { + APP_ABORT(" Error: RealDenseHamiltonian_v2 requires complex build (QMC_COMPLEX=1). \n"); + return HamiltonianOperations(); +} + + +#endif + +} +} diff --git a/src/AFQMC/Hamiltonians/RealDenseHamiltonian_v2.h b/src/AFQMC/Hamiltonians/RealDenseHamiltonian_v2.h new file mode 100644 index 0000000000..f0b8b386ec --- /dev/null +++ b/src/AFQMC/Hamiltonians/RealDenseHamiltonian_v2.h @@ -0,0 +1,106 @@ + +#ifndef QMCPLUSPLUS_AFQMC_REALDENSEHAMILTONIAN_V2_H +#define QMCPLUSPLUS_AFQMC_REALDENSEHAMILTONIAN_V2_H + +#include +#include +#include +#include + +#include "io/hdf_archive.h" + +#include "AFQMC/config.h" +#include "AFQMC/Memory/utilities.hpp" +#include "AFQMC/Utilities/taskgroup.h" +#include "AFQMC/Numerics/ma_operations.hpp" + +#include "AFQMC/Hamiltonians/OneBodyHamiltonian.hpp" + +#include "AFQMC/HamiltonianOperations/HamiltonianOperations.hpp" + +namespace qmcplusplus +{ + +namespace afqmc +{ + +class RealDenseHamiltonian_v2: public OneBodyHamiltonian +{ + + public: + + RealDenseHamiltonian_v2(AFQMCInfo const& info, xmlNodePtr cur, + boost::multi::array&& h, + TaskGroup_& tg_, ValueType nucE=0, ValueType fzcE=0): + OneBodyHamiltonian(info,std::move(h),nucE,fzcE), + TG(tg_),fileName(""),batched("no"),ooc("no"),max_memory_MB(2000) + { + + if(number_of_devices() > 0) batched="yes"; + std::string str("yes"); + ParameterSet m_param; + m_param.add(fileName,"filename","std::string"); + m_param.add(max_memory_MB,"max_memory","int"); + if(TG.TG_local().size() == 1) + m_param.add(batched,"batched","std::string"); + if(TG.TG_local().size() == 1) + m_param.add(ooc,"ooc","std::string"); + m_param.put(cur); + + if(omp_get_num_threads() > 1 && (batched != "yes" && batched != "true")) { + app_log()<<" WARNING!!!: Found OMP_NUM_THREADS > 1 with batched=no.\n" + <<" This will lead to low performance. Set batched=yes. \n"; + } + + if( TG.TG_local().size() > 1 || not (batched=="yes" || batched == "true" )) { + app_log()<<" alt_version = true only on GPU build with batched=true. " < getH1() const{ return OneBodyHamiltonian::getH1(); } + + HamiltonianOperations getHamiltonianOperations(bool pureSD, bool addCoulomb, WALKER_TYPES type, + std::vector& PsiT, double cutvn, double cutv2, + TaskGroup_& TGprop, TaskGroup_& TGwfn, hdf_archive& dump); + + ValueType H(IndexType I, IndexType J) const + { return OneBodyHamiltonian::H(I,J); } + + // this should never be used outside initialization routines. + ValueType H(IndexType I, IndexType J, IndexType K, IndexType L) const + { + APP_ABORT("Error: Calling H(I,J,K,L) in THCHamiltonian. 
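Both readers form the same one-body correction noted in the comments above, vn0(i,l) = -0.5 * sum_{j,n} L[i][j][n] * L[l][j][n], by viewing Likn as an NMO x (NMO*nchol) matrix and multiplying a row slice by its own transpose. A plain loop reference for that quantity (a sketch only: double precision, no node distribution or reduction):

    #include <cstddef>
    #include <vector>

    // vn0[i][l] = -0.5 * sum_{j,n} L[i][j][n] * L[l][j][n],
    // with L stored row-major as L[(i*NMO + j)*nchol + n].
    std::vector<double> make_vn0(int NMO, int nchol, const std::vector<double>& L)
    {
      std::vector<double> vn0(static_cast<std::size_t>(NMO) * NMO, 0.0);
      for (int i = 0; i < NMO; ++i)
        for (int l = 0; l < NMO; ++l) {
          double s = 0.0;
          for (int j = 0; j < NMO; ++j)
            for (int n = 0; n < nchol; ++n)
              s += L[(i * NMO + j) * nchol + n] * L[(l * NMO + j) * nchol + n];
          vn0[i * NMO + l] = -0.5 * s;
        }
      return vn0;
    }

Flattening L to an NMO x (NMO*nchol) matrix turns the double sum over (j,n) into a single GEMM, which is what the ma::product(-0.5, Lijn.sliced(i0,iN), ma::T(Lijn), 0.0, v_) call above performs for the locally owned rows [i0,iN).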
\n"); + return ValueType(0.0); + } + + protected: + + // for hamiltonian distribution + TaskGroup_& TG; + + std::string fileName; + + std::string batched; + + std::string ooc; + + int max_memory_MB; + +}; + +} +} + +#endif + diff --git a/src/AFQMC/Memory/SharedMemory/shm_ptr_with_raw_ptr_dispatch.hpp b/src/AFQMC/Memory/SharedMemory/shm_ptr_with_raw_ptr_dispatch.hpp index 4f506c1272..a7f0635714 100644 --- a/src/AFQMC/Memory/SharedMemory/shm_ptr_with_raw_ptr_dispatch.hpp +++ b/src/AFQMC/Memory/SharedMemory/shm_ptr_with_raw_ptr_dispatch.hpp @@ -168,7 +168,7 @@ shm_ptr_with_raw_ptr_dispatch uninitialized_fill_n(shm_ptr_with_raw_ptr_dispa if(first.wSP_->get_group().root()) std::uninitialized_fill_n(to_address(first), n, val); // change to to_pointer first.wSP_->fence(); first.wSP_->fence(); - mpi3::communicator(first.wSP_->get_group()).barrier(); + first.wSP_->comm_.barrier(); return first + n; } @@ -178,7 +178,7 @@ shm_ptr_with_raw_ptr_dispatch uninitialized_fill_n(Alloc &a, shm_ptr_with_raw if(first.wSP_->get_group().root()) std::uninitialized_fill_n(to_address(first), n, val); // change to to_pointer first.wSP_->fence(); first.wSP_->fence(); - mpi3::communicator(first.wSP_->get_group()).barrier(); + first.wSP_->comm_.barrier(); return first + n; } @@ -191,7 +191,7 @@ shm_ptr_with_raw_ptr_dispatch destroy_n(shm_ptr_with_raw_ptr_dispatch firs } first.wSP_->fence(); first.wSP_->fence(); - mpi3::communicator(first.wSP_->get_group()).barrier(); + first.wSP_->comm_.barrier(); return first + n; } @@ -204,7 +204,7 @@ shm_ptr_with_raw_ptr_dispatch destroy_n(Alloc &a, shm_ptr_with_raw_ptr_dispat } first.wSP_->fence(); first.wSP_->fence(); - mpi3::communicator(first.wSP_->get_group()).barrier(); + first.wSP_->comm_.barrier(); return first + n; } @@ -214,7 +214,7 @@ shm_ptr_with_raw_ptr_dispatch copy_n(It1 first, Size n, shm_ptr_with_raw_ptr_ using std::copy_n; if(d_first.wSP_->get_group().root()) copy_n(first, n, to_address(d_first)); d_first.wSP_->fence(); - mpi3::communicator(d_first.wSP_->get_group()).barrier(); + d_first.wSP_->comm_.barrier(); return d_first + n; } @@ -224,7 +224,7 @@ shm_ptr_with_raw_ptr_dispatch copy(It1 first, It1 last, shm_ptr_with_raw_ptr_ using std::copy; if(d_first.wSP_->get_group().root()) copy(first, last, to_address(d_first)); first.wSP_->fence(); - mpi3::communicator(d_first.wSP_->get_group()).barrier(); + d_first.wSP_->comm_.barrier(); using std::distance; return d_first + distance(first, last); } @@ -235,7 +235,7 @@ shm_ptr_with_raw_ptr_dispatch uninitialized_copy_n(It1 f, Size n, shm_ptr_wit using std::uninitialized_copy_n; if(d.wSP_->get_group().root()) uninitialized_copy_n(f, n, to_address(d)); f.wSP_->fence(); - mpi3::communicator(d.wSP_->get_group()).barrier(); + d.wSP_->comm_.barrier(); return d + n; } @@ -245,7 +245,7 @@ shm_ptr_with_raw_ptr_dispatch uninitialized_copy_n(Alloc &a, It1 f, Size n, s using std::uninitialized_copy_n; if(d.wSP_->get_group().root()) uninitialized_copy_n(f, n, to_address(d)); f.wSP_->fence(); - mpi3::communicator(d.wSP_->get_group()).barrier(); + d.wSP_->comm_.barrier(); return d + n; } @@ -256,7 +256,7 @@ shm_ptr_with_raw_ptr_dispatch uninitialized_copy(It1 f, It1 l, shm_ptr_with_r using std::uninitialized_copy; if(d.wSP_->get_group().root()) uninitialized_copy(f, l, to_address(d)); d.wSP_->fence(); - mpi3::communicator(d.wSP_->get_group()).barrier(); + d.wSP_->comm_.barrier(); using std::distance; return d + distance(f, l); } @@ -271,7 +271,7 @@ shm_ptr_with_raw_ptr_dispatch uninitialized_default_construct_n(shm_ptr_with_ }catch(...) 
{throw;} // leak! } f.wSP_->fence(); - mpi3::communicator(f.wSP_->get_group()).barrier(); + f.wSP_->comm_.barrier(); return f + n; } @@ -285,7 +285,7 @@ shm_ptr_with_raw_ptr_dispatch uninitialized_value_construct_n(shm_ptr_with_ra }catch(...){throw;} // leak !! } f.wSP_->fence(); - mpi3::communicator(f.wSP_->get_group()).barrier(); + f.wSP_->comm_.barrier(); return f + n; } @@ -300,7 +300,7 @@ shm_ptr_with_raw_ptr_dispatch uninitialized_default_construct_n(Alloc &a, shm }catch(...) {throw;} // leak! } f.wSP_->fence(); - mpi3::communicator(f.wSP_->get_group()).barrier(); + f.wSP_->comm_.barrier(); return f + n; } @@ -315,7 +315,7 @@ shm_ptr_with_raw_ptr_dispatch uninitialized_value_construct_n(Alloc &a, shm_p }catch(...){throw;} // leak !! } f.wSP_->fence(); - mpi3::communicator(f.wSP_->get_group()).barrier(); + f.wSP_->comm_.barrier(); return f + n; } @@ -340,7 +340,7 @@ multi::array_iterator> uninitialized } base(first).wSP_->fence(); base(first).wSP_->fence(); - mpi3::communicator(base(first).wSP_->get_group()).barrier(); + base(first).wSP_->comm_.barrier(); return first + n; } @@ -371,7 +371,7 @@ multi::array_iterator> copy_n( } base(dest).wSP_->fence(); base(dest).wSP_->fence(); - mpi3::communicator(base(dest).wSP_->get_group()).barrier(); + base(dest).wSP_->comm_.barrier(); return dest + n; } @@ -390,7 +390,7 @@ multi::array_iterator> copy_n( } base(dest).wSP_->fence(); base(dest).wSP_->fence(); - mpi3::communicator(base(dest).wSP_->get_group()).barrier(); + base(dest).wSP_->comm_.barrier(); return dest + n; } @@ -463,7 +463,7 @@ multi::array_iterator> uninitialized } base(dest).wSP_->fence(); base(dest).wSP_->fence(); - mpi3::communicator(base(dest).wSP_->get_group()).barrier(); + base(dest).wSP_->comm_.barrier(); return dest + n; } @@ -495,7 +495,7 @@ multi::array_iterator uninitialized_copy( a.construct(addressof(*d), *first); }catch(...){throw;} } - mpi3::communicator(base(first).wSP_->get_group()).barrier(); + base(first).wSP_->comm_.barrier(); return dest + std::distance(first,last); } diff --git a/src/AFQMC/Memory/raw_pointers.hpp b/src/AFQMC/Memory/raw_pointers.hpp index 87d9923885..d8ad14f7c7 100644 --- a/src/AFQMC/Memory/raw_pointers.hpp +++ b/src/AFQMC/Memory/raw_pointers.hpp @@ -50,6 +50,22 @@ namespace afqmc { return B; } + /************* inplace_cast ****************/ + template + void inplace_cast(T* A, Size n) { + Q *B(reinterpret_cast(A)); + if( sizeof(T) >= sizeof(Q) ) { + for(Size i=0; i(*A); + } else if(sizeof(T) < sizeof(Q)) { + assert( sizeof(T)*2 <= sizeof(Q)); + A += (n-1); + B += (n-1); + for(; n>0; n--, --A, --B) + *B = static_cast(*A); + } + } + /************* fill2D ****************/ template void fill2D(Size N, Size M, T* y, Size lda, T const a) diff --git a/src/AFQMC/Numerics/detail/CPU/blas_cpu.hpp b/src/AFQMC/Numerics/detail/CPU/blas_cpu.hpp index fff72419fc..afd3286639 100644 --- a/src/AFQMC/Numerics/detail/CPU/blas_cpu.hpp +++ b/src/AFQMC/Numerics/detail/CPU/blas_cpu.hpp @@ -283,7 +283,7 @@ namespace ma dgemm('N','T',2,n,m,alpha,reinterpret_cast(x),2*incx,amat,lda, beta,reinterpret_cast(y),2*incy); else if(trans_in == 't' || trans_in == 'T') - dgemm('N','N',2,n,m,alpha,reinterpret_cast(x),2*incx,amat,lda, + dgemm('N','N',2,m,n,alpha,reinterpret_cast(x),2*incx,amat,lda, beta,reinterpret_cast(y),2*incy); else { print_stacktrace @@ -304,7 +304,7 @@ namespace ma sgemm('N','T',2,n,m,alpha,reinterpret_cast(x),2*incx,amat,lda, beta,reinterpret_cast(y),2*incy); else if(trans_in == 't' || trans_in == 'T') - 
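The inplace_cast helper added to raw_pointers.hpp above converts an array between element types without a second allocation: when the destination type is no larger it walks forward, and when it is larger it walks backward from the end so each source element is still intact when its wider destination slot is written (hence the assert that the destination is at least twice the source size). A small usage sketch of the same idea, assuming a double/float round trip on one buffer and mirroring the original's reinterpret_cast type punning:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> buf = {1.25, -2.5, 3.0, 4.75};  // exactly representable in float
      const std::size_t n = buf.size();
      float* f = reinterpret_cast<float*>(buf.data());

      for (std::size_t i = 0; i < n; ++i)        // narrow: double -> float, forward
        f[i] = static_cast<float>(buf[i]);
      // ... use f[0..n) as single-precision scratch ...
      for (std::size_t i = n; i-- > 0;)          // widen: float -> double, backward
        buf[i] = static_cast<double>(f[i]);

      for (double x : buf) std::printf("%g ", x);  // prints: 1.25 -2.5 3 4.75
      std::printf("\n");
    }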
sgemm('N','N',2,n,m,alpha,reinterpret_cast(x),2*incx,amat,lda, + sgemm('N','N',2,m,n,alpha,reinterpret_cast(x),2*incx,amat,lda, beta,reinterpret_cast(y),2*incy); else { print_stacktrace diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/Tab_to_Kl.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/Tab_to_Kl.cu new file mode 100644 index 0000000000..0ba23f3b52 --- /dev/null +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/Tab_to_Kl.cu @@ -0,0 +1,116 @@ +////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source +// License. See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// +// File developed by: +// Lawrence Livermore National Laboratory +// +// File created by: +// Miguel A. Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include "AFQMC/Numerics/detail/CUDA/Kernels/cuda_settings.h" +#define ENABLE_CUDA 1 +#include "AFQMC/Memory/CUDA/cuda_utilities.h" + +namespace kernels +{ + +// Tab [nbatch][nwalk][nocc][nocc][nchol] +// Klr [nwalk][2*nchol_tot] +template +__global__ void kernel_Tab_to_Kl(int nwalk, int nocc, int nchol, + thrust::complex const* Tab, + thrust::complex * Kl) +{ + int w = blockIdx.x; + if(w < nwalk) { + for(int a=0; a const* Tab_(Tab + ((w*nocc+a)*nocc + a)*nchol); + thrust::complex * Kl_(Kl + w*nchol); + int c = threadIdx.x; + while( c < nchol ) { + Kl_[c] += Tab_[c]; + c += blockDim.x; + } + } + } +} + +template +__global__ void kernel_Tanb_to_Kl(int nwalk, int nocc, int nchol, int nchol_tot, + thrust::complex const* Tab, + thrust::complex * Kl) +{ + int w = blockIdx.x; + if(w < nwalk) { + for(int a=0; a const* Tab_(Tab + ((w*nocc+a)*nocc)*nchol+a); + thrust::complex * Kl_(Kl + w*nchol_tot); + int c = threadIdx.x; + while( c < nchol ) { + Kl_[c] += Tab_[c*nocc]; + c += blockDim.x; + } + } + } +} + +void Tab_to_Kl(int nwalk, int nocc, int nchol,std::complex const* Tab, + std::complex * Kl) +{ + dim3 grid_dim(nwalk,1,1); + int nthr = std::min(256,nchol); + kernel_Tab_to_Kl<<>>(nwalk,nocc,nchol, + reinterpret_cast const*>(Tab), + reinterpret_cast *>(Kl)); + qmc_cuda::cuda_check(cudaGetLastError(),"Tab_to_Kl"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"Tab_to_Kl"); +} + +void Tab_to_Kl(int nwalk, int nocc, int nchol,std::complex const* Tab, + std::complex * Kl) +{ + dim3 grid_dim(nwalk,1,1); + int nthr = std::min(256,nchol); + kernel_Tab_to_Kl<<>>(nwalk,nocc,nchol, + reinterpret_cast const*>(Tab), + reinterpret_cast *>(Kl)); + qmc_cuda::cuda_check(cudaGetLastError(),"Tab_to_Kl"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"Tab_to_Kl"); +} + + +void Tanb_to_Kl(int nwalk, int nocc, int nchol, int nchol_tot, + std::complex const* Tab, std::complex * Kl) +{ + dim3 grid_dim(nwalk,1,1); + int nthr = std::min(256,nchol); + kernel_Tanb_to_Kl<<>>(nwalk,nocc,nchol,nchol_tot, + reinterpret_cast const*>(Tab), + reinterpret_cast *>(Kl)); + qmc_cuda::cuda_check(cudaGetLastError(),"Tab_to_Kl"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"Tab_to_Kl"); +} + +void Tanb_to_Kl(int nwalk, int nocc, int nchol, int nchol_tot, + std::complex const* Tab, std::complex * Kl) +{ + dim3 grid_dim(nwalk,1,1); + int nthr = std::min(256,nchol); + kernel_Tanb_to_Kl<<>>(nwalk,nocc,nchol,nchol_tot, + reinterpret_cast const*>(Tab), + reinterpret_cast *>(Kl)); + 
qmc_cuda::cuda_check(cudaGetLastError(),"Tab_to_Kl"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"Tab_to_Kl"); +} + +} diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/Tab_to_Kl.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/Tab_to_Kl.cuh new file mode 100644 index 0000000000..634394bb89 --- /dev/null +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/Tab_to_Kl.cuh @@ -0,0 +1,38 @@ +////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source +// License. See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// +// File developed by: +// Lawrence Livermore National Laboratory +// +// File created by: +// Miguel A. Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AFQMC_TAB_TO_KL_H +#define AFQMC_TAB_TO_KL_H + +#include +#include + +namespace kernels +{ + +void Tab_to_Kl(int nwalk, int nocc, int nchol, std::complex const* Tab, + std::complex * Kl); + +void Tab_to_Kl(int nwalk, int nocc, int nchol, std::complex const* Tab, + std::complex * Kl); + +void Tanb_to_Kl(int nwalk, int nocc, int nchol, int nchol_tot, std::complex const* Tab, + std::complex * Kl); + +void Tanb_to_Kl(int nwalk, int nocc, int nchol, int nchol_tot, std::complex const* Tab, + std::complex * Kl); + +} + +#endif diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cu index 60ac44224e..240afed5c5 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cu +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cu @@ -69,6 +69,49 @@ __global__ void kernel_batched_Tab_to_Klr(int nterms, int nwalk, int nocc, int n } } +template +__global__ void kernel_batched_Tanb_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, + int nchol_tot, int ncholQ, int ncholQ0, int* kdiag, + thrust::complex const* Tab, + thrust::complex * Kl, + thrust::complex * Kr) +{ + int w = blockIdx.x; + if(blockIdx.y == 0) { + for( int k=0; k const* Tba_(Tab + batch*nwalk*nocc*nocc*nchol_max + + ((w*nocc+a)*nocc)*nchol_max + a); + thrust::complex * Kr_(Kr + w*nchol_tot + ncholQ0); + int c = threadIdx.x; + while( c < ncholQ ) { + Kr_[c] += Tba_[c*nocc]; + c += blockDim.x; + } + } + } + } + } else if(blockIdx.y == 1) { + for( int k=0; k const* Tab_(Tab + (batch+1)*nwalk*nocc*nocc*nchol_max + + ((w*nocc+a)*nocc)*nchol_max + a); + thrust::complex * Kl_(Kl + w*nchol_tot + ncholQ0); + int c = threadIdx.x; + while( c < ncholQ ) { + Kl_[c] += Tab_[c*nocc]; + c += blockDim.x; + } + } + } + } + } +} + void batched_Tab_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, int nchol_tot, int ncholQ, int ncholQ0, int* kdiag, std::complex const* Tab, @@ -103,5 +146,38 @@ void batched_Tab_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, qmc_cuda::cuda_check(cudaDeviceSynchronize(),"batched_Tab_to_Klr"); } +void batched_Tanb_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, + int nchol_tot, int ncholQ, int ncholQ0, int* kdiag, + std::complex const* Tab, + std::complex * Kl, + std::complex * Kr) +{ + dim3 grid_dim(nwalk,2,1); + int nthr = std::min(256,ncholQ); // is this needed? 
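The Tab_to_Kl kernels above accumulate, per walker, the occupied-orbital diagonal of the Tab tensor into the Cholesky vector Kl: Kl[w][c] += sum_a Tab[w][a][a][c] (the Tanb variant does the same for the [w][a][n][b] layout, hence the extra stride of nocc). An equivalent host-side loop for the wabn layout; a sketch only, the production CPU fallback lives in Numerics/helpers/batched_operations.hpp further down in this diff:

    #include <complex>
    #include <vector>

    // Kl[w][c] += sum_a Tab[w][a][a][c], Tab stored as [nwalk][nocc][nocc][nchol].
    void Tab_to_Kl_ref(int nwalk, int nocc, int nchol,
                       const std::vector<std::complex<double>>& Tab,
                       std::vector<std::complex<double>>& Kl)
    {
      for (int w = 0; w < nwalk; ++w)
        for (int a = 0; a < nocc; ++a) {
          const std::complex<double>* T = &Tab[((w * nocc + a) * nocc + a) * nchol];
          for (int c = 0; c < nchol; ++c)
            Kl[w * nchol + c] += T[c];
        }
    }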
+ kernel_batched_Tanb_to_Klr<<>>(nterms,nwalk,nocc,nchol_max,nchol_tot,ncholQ, + ncholQ0,kdiag, + reinterpret_cast const*>(Tab), + reinterpret_cast *>(Kl), + reinterpret_cast *>(Kr)); + qmc_cuda::cuda_check(cudaGetLastError(),"batched_Tanb_to_Klr"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"batched_Tanb_to_Klr"); +} + +void batched_Tanb_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, + int nchol_tot, int ncholQ, int ncholQ0, int* kdiag, + std::complex const* Tab, + std::complex * Kl, + std::complex * Kr) +{ + dim3 grid_dim(nwalk,2,1); + int nthr = std::min(256,ncholQ); // is this needed? + kernel_batched_Tanb_to_Klr<<>>(nterms,nwalk,nocc,nchol_max,nchol_tot,ncholQ, + ncholQ0,kdiag, + reinterpret_cast const*>(Tab), + reinterpret_cast *>(Kl), + reinterpret_cast *>(Kr)); + qmc_cuda::cuda_check(cudaGetLastError(),"batched_Tanb_to_Klr"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"batched_Tanb_to_Klr"); +} } diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cuh index 0ee62e606d..d9c6ab03f5 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cuh +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cuh @@ -33,6 +33,18 @@ void batched_Tab_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, std::complex * Kl, std::complex * Kr); +void batched_Tanb_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, + int nchol_tot, int ncholQ, int ncholQ0, int* kdiag, + std::complex const* Tab, + std::complex * Kl, + std::complex * Kr); + +void batched_Tanb_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, + int nchol_tot, int ncholQ, int ncholQ0, int* kdiag, + std::complex const* Tab, + std::complex * Kl, + std::complex * Kr); + } #endif diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cu index 68b0f8774b..bf16296c16 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cu +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cu @@ -73,6 +73,51 @@ __global__ void kernel_batched_dot_wabn_wban(int nbatch, int nwalk, int nocc, in } } +template +__global__ void kernel_batched_dot_wanb_wbna(int nbatch, int nwalk, int nocc, int nchol, + thrust::complex const* alpha, thrust::complex const* Tab, + thrust::complex* y, int incy) +{ + int batch = blockIdx.x; + if( batch >= nbatch ) return; + if( blockIdx.y >= nwalk*nocc*nocc ) return; + __shared__ thrust::complex cache[ DOT_BLOCK_SIZE ]; + int nocc2 = nocc*nocc; + int w = blockIdx.y/(nocc2); + int a = (blockIdx.y%(nocc2))/nocc; + int b = (blockIdx.y%(nocc2))%nocc; + int i = threadIdx.x; + thrust::complex alp = static_cast>(alpha[batch]); + thrust::complex const* A_(Tab + 2*batch*nwalk*nocc2*nchol + ((w*nocc+a)*nocc)*nchol+b); + thrust::complex const* B_(Tab + (2*batch+1)*nwalk*nocc2*nchol + ((w*nocc+b)*nocc)*nchol+a); + cache[ threadIdx.x ] = thrust::complex(0.0); + while( i < nchol ) { + cache[ threadIdx.x ] += static_cast>(A_[ i*nocc ] * B_[ i*nocc ]); + i += blockDim.x; + } + __syncthreads(); // required because later on the current thread is accessing + // data written by another thread + i = DOT_BLOCK_SIZE / 2; + while( i > 0 ) { + if( threadIdx.x < i ) cache[ threadIdx.x ] += cache[ threadIdx.x + i ]; + __syncthreads(); + i /= 2; //not sure bitwise operations are actually faster + } + //if( threadIdx.x == 0 ) *(y+w*incy) = (*(y+w*incy)) + alp * cache[ 0 ]; + if( threadIdx.x == 0 ) { + T re = (alp * cache[ 0 
]).real(); + T im = (alp * cache[ 0 ]).imag(); + T* re_ = reinterpret_cast(y+w*incy); +#if __CUDA_ARCH__ < 600 + myAtomicAdd(re_,re); + myAtomicAdd(re_+1,im); +#else + atomicAdd(re_,re); + atomicAdd(re_+1,im); +#endif + } +} + void batched_dot_wabn_wban( int nbatch, int nwalk, int nocc, int nchol, std::complex const* alpha, std::complex const* Tab, std::complex* y, int incy) @@ -115,5 +160,48 @@ void batched_dot_wabn_wban( int nbatch, int nwalk, int nocc, int nchol, qmc_cuda::cuda_check(cudaDeviceSynchronize(),"batched_dot_wabn_wban"); } +// anb/bna +void batched_dot_wanb_wbna( int nbatch, int nwalk, int nocc, int nchol, + std::complex const* alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int n_=nwalk*nocc*nocc; + dim3 grid_dim(nbatch,n_,1); + kernel_batched_dot_wanb_wbna<<>>(nbatch,nwalk,nocc,nchol, + reinterpret_cast const*>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"batched_dot_wanb_wbna"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"batched_dot_wanb_wbna"); +} + +void batched_dot_wanb_wbna( int nbatch, int nwalk, int nocc, int nchol, + std::complex const* alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int n_=nwalk*nocc*nocc; + dim3 grid_dim(nbatch,n_,1); + kernel_batched_dot_wanb_wbna<<>>(nbatch,nwalk,nocc,nchol, + reinterpret_cast const*>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"batched_dot_wanb_wbna"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"batched_dot_wanb_wbna"); +} + +void batched_dot_wanb_wbna( int nbatch, int nwalk, int nocc, int nchol, + std::complex const* alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int n_=nwalk*nocc*nocc; + dim3 grid_dim(nbatch,n_,1); + kernel_batched_dot_wanb_wbna<<>>(nbatch,nwalk,nocc,nchol, + reinterpret_cast const*>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"batched_dot_wanb_wbna"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"batched_dot_wanb_wbna"); +} + } diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cuh index c271eefbf1..a1a7437cc1 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cuh +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cuh @@ -31,6 +31,17 @@ void batched_dot_wabn_wban( int nbatch, int nwalk, int nocc, int nchol, void batched_dot_wabn_wban( int nbatch, int nwalk, int nocc, int nchol, std::complex const* alpha, std::complex const* Tab, std::complex* y, int incy); + +void batched_dot_wanb_wbna( int nbatch, int nwalk, int nocc, int nchol, + std::complex const* alpha, std::complex const* Tab, + std::complex* y, int incy); +void batched_dot_wanb_wbna( int nbatch, int nwalk, int nocc, int nchol, + std::complex const* alpha, std::complex const* Tab, + std::complex* y, int incy); +void batched_dot_wanb_wbna( int nbatch, int nwalk, int nocc, int nchol, + std::complex const* alpha, std::complex const* Tab, + std::complex* y, int incy); + } #endif diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/dot_wabn.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/dot_wabn.cu new file mode 100644 index 0000000000..5631fb6031 --- /dev/null +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/dot_wabn.cu @@ -0,0 +1,308 @@ +////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of 
Illinois/NCSA Open Source +// License. See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// +// File developed by: +// Lawrence Livermore National Laboratory +// +// File created by: +// Miguel A. Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include "AFQMC/Numerics/detail/CUDA/Kernels/cuda_settings.h" +#define ENABLE_CUDA 1 +#include "AFQMC/Memory/CUDA/cuda_utilities.h" +#if __CUDA_ARCH__ < 600 +#include "AFQMC/Numerics/detail/CUDA/Kernels/myAtomicAdd.cu" +#endif + +namespace kernels +{ + +// Tab nwalk][nocc][nocc][nchol] +template +__global__ void kernel_dot_wabn(int nwalk, int nocc, int nchol, + thrust::complex const alpha, thrust::complex const* Tab, + thrust::complex* y, int incy) +{ + if( blockIdx.x >= nwalk*nocc*nocc ) return; + __shared__ thrust::complex cache[ DOT_BLOCK_SIZE ]; + int nocc2 = nocc*nocc; + int w = blockIdx.x/(nocc2); + int a = (blockIdx.x%(nocc2))/nocc; + int b = (blockIdx.x%(nocc2))%nocc; + int i = threadIdx.x; + thrust::complex alp = static_cast>(alpha); + thrust::complex const* A_(Tab + ((w*nocc+a)*nocc + b)*nchol); + thrust::complex const* B_(Tab + ((w*nocc+b)*nocc + a)*nchol); + cache[ threadIdx.x ] = thrust::complex(0.0); + while( i < nchol ) { + cache[ threadIdx.x ] += static_cast>(A_[ i ] * B_[ i ]); + i += blockDim.x; + } + __syncthreads(); // required because later on the current thread is accessing + // data written by another thread + i = DOT_BLOCK_SIZE / 2; + while( i > 0 ) { + if( threadIdx.x < i ) cache[ threadIdx.x ] += cache[ threadIdx.x + i ]; + __syncthreads(); + i /= 2; //not sure bitwise operations are actually faster + } + if( threadIdx.x == 0 ) { + T re = (alp * cache[ 0 ]).real(); + T im = (alp * cache[ 0 ]).imag(); + T* re_ = reinterpret_cast(y+w*incy); +#if __CUDA_ARCH__ < 600 + myAtomicAdd(re_,re); + myAtomicAdd(re_+1,im); +#else + atomicAdd(re_,re); + atomicAdd(re_+1,im); +#endif + } +} + +template +__global__ void kernel_dot_wanb(int nt, int nwalk, int nocc, int nchol, + thrust::complex const alpha, thrust::complex const* Tab, + thrust::complex* y, int incy) +{ + if( blockIdx.x >= nwalk ) return; + + int a = blockIdx.y*blockDim.x + threadIdx.x; + int nb = blockIdx.z*blockDim.y*nt + threadIdx.y; + + __shared__ thrust::complex cache[ 1024 ]; + + int nid = blockDim.x*blockDim.y*blockDim.z; + int id = (threadIdx.x * blockDim.y + threadIdx.y) *blockDim.z + threadIdx.z; + + cache[ id ] = thrust::complex(0.0); + thrust::complex alp = static_cast>(alpha); + thrust::complex const* A_(Tab + blockIdx.x*nocc*nocc*nchol); + for(int n=0; n= nocc || nb >= nocc*nchol) break; + int i = nb/nocc; + int b = nb%nocc; + cache[id] += static_cast>( A_[(a*nchol + i)*nocc + b] * + A_[(b*nchol + i)*nocc + a]); + } + + __syncthreads(); // required because later on the current thread is accessing + // data written by another thread + int i = nid / 2; + while( i > 0 ) { + if( id < i ) cache[ id ] += cache[ id + i ]; + __syncthreads(); + i /= 2; //not sure bitwise operations are actually faster + } + if( id == 0 ) { + T re = (alp * cache[ 0 ]).real(); + T im = (alp * cache[ 0 ]).imag(); + T* re_ = reinterpret_cast(y+blockIdx.x*incy); +#if __CUDA_ARCH__ < 600 + myAtomicAdd(re_,re); + myAtomicAdd(re_+1,im); +#else + atomicAdd(re_,re); + atomicAdd(re_+1,im); +#endif + } +} + +template +__global__ void kernel_dot_wanb2(int nwalk, int nocc, int 
nchol, + thrust::complex const alpha, thrust::complex const* Tab, + thrust::complex* y, int incy) +{ + if( blockIdx.x >= nwalk*nocc*nocc ) return; + __shared__ thrust::complex cache[ DOT_BLOCK_SIZE ]; + int nocc2 = nocc*nocc; + int w = blockIdx.x/(nocc2); + int a = (blockIdx.x%(nocc2))/nocc; + int b = (blockIdx.x%(nocc2))%nocc; + int i = threadIdx.x; + thrust::complex alp = static_cast>(alpha); + thrust::complex const* A_(Tab + ((w*nocc+a)*nocc)*nchol+b); + thrust::complex const* B_(Tab + ((w*nocc+b)*nocc)*nchol+a); + cache[ threadIdx.x ] = thrust::complex(0.0); + while( i < nchol ) { + cache[ threadIdx.x ] += static_cast>(A_[ i*nocc ] * B_[ i*nocc ]); + i += blockDim.x; + } + __syncthreads(); // required because later on the current thread is accessing + // data written by another thread + i = DOT_BLOCK_SIZE / 2; + while( i > 0 ) { + if( threadIdx.x < i ) cache[ threadIdx.x ] += cache[ threadIdx.x + i ]; + __syncthreads(); + i /= 2; //not sure bitwise operations are actually faster + } + if( threadIdx.x == 0 ) { + T re = (alp * cache[ 0 ]).real(); + T im = (alp * cache[ 0 ]).imag(); + T* re_ = reinterpret_cast(y+w*incy); +#if __CUDA_ARCH__ < 600 + myAtomicAdd(re_,re); + myAtomicAdd(re_+1,im); +#else + atomicAdd(re_,re); + atomicAdd(re_+1,im); +#endif + } +} + +void dot_wabn( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int n_=nwalk*nocc*nocc; + dim3 grid_dim(n_,1,1); + kernel_dot_wabn<<>>(nwalk,nocc,nchol, + static_cast const>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"dot_wabn"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"dot_wabn"); +} + +void dot_wabn( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int n_=nwalk*nocc*nocc; + dim3 grid_dim(n_,1,1); + kernel_dot_wabn<<>>(nwalk,nocc,nchol, + static_cast const>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"dot_wabn"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"dot_wabn"); +} + +void dot_wabn( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int n_=nwalk*nocc*nocc; + dim3 grid_dim(n_,1,1); + kernel_dot_wabn<<>>(nwalk,nocc,nchol, + static_cast const>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"dot_wabn"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"dot_wabn"); +} + +// v2 +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int a_(8); + int nf(8); + int b_ = 1024/a_; + int na = (nocc-1)/a_+1; + int nb = (nocc*nchol-1)/(b_*nf)+1; + dim3 grid_dim(nwalk,na,nb); + dim3 block_dim(a_,b_,1); + kernel_dot_wanb<<>>(nf,nwalk,nocc,nchol, + static_cast const>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"dot_wanb"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"dot_wanb"); +} + +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int a_(8); + int nf(8); + int b_ = 1024/a_; + int na = (nocc-1)/a_+1; + int nb = (nocc*nchol-1)/(b_*nf)+1; + dim3 grid_dim(nwalk,na,nb); + dim3 block_dim(a_,b_,1); + kernel_dot_wanb<<>>(nf,nwalk,nocc,nchol, + static_cast const>(alpha), + reinterpret_cast 
const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"dot_wanb"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"dot_wanb"); +} + +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int a_(8); + int nf(8); + int b_ = 1024/a_; + int na = (nocc-1)/a_+1; + int nb = (nocc*nchol-1)/(b_*nf)+1; + dim3 grid_dim(nwalk,na,nb); + dim3 block_dim(a_,b_,1); + kernel_dot_wanb<<>>(nf,nwalk,nocc,nchol, + static_cast const>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"dot_wanb"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"dot_wanb"); +} + +/* +// v2 +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int n_=nwalk*nocc*nocc; + dim3 grid_dim(n_,1,1); + kernel_dot_wanb2<<>>(nwalk,nocc,nchol, + static_cast const>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"dot_wanb"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"dot_wanb"); +} + +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int n_=nwalk*nocc*nocc; + dim3 grid_dim(n_,1,1); + kernel_dot_wanb2<<>>(nwalk,nocc,nchol, + static_cast const>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"dot_wanb"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"dot_wanb"); +} + +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int n_=nwalk*nocc*nocc; + dim3 grid_dim(n_,1,1); + kernel_dot_wanb2<<>>(nwalk,nocc,nchol, + static_cast const>(alpha), + reinterpret_cast const*>(Tab), + reinterpret_cast *>(y),incy); + qmc_cuda::cuda_check(cudaGetLastError(),"dot_wanb"); + qmc_cuda::cuda_check(cudaDeviceSynchronize(),"dot_wanb"); +} +*/ + +} diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/dot_wabn.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/dot_wabn.cuh new file mode 100644 index 0000000000..8610ca85db --- /dev/null +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/dot_wabn.cuh @@ -0,0 +1,47 @@ +////////////////////////////////////////////////////////////////////// +// This file is distributed under the University of Illinois/NCSA Open Source +// License. See LICENSE file in top directory for details. +// +// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers. +// +// File developed by: +// Lawrence Livermore National Laboratory +// +// File created by: +// Miguel A. 
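dot_wabn and dot_wanb compute the same walker-wise contraction, y[w*incy] += alpha * sum_{a,b,n} T[w,a,b,n] * T[w,b,a,n]; the two entry points differ only in whether the Cholesky index n is the fastest (wabn) or the middle (wanb) index of the tensor, which changes the stride of the two pointers inside the kernels. A host-side reference loop for the wabn layout (a sketch; the CPU fallback used by the library is in Numerics/helpers/batched_operations.hpp):

    #include <complex>
    #include <vector>

    // y[w*incy] += alpha * sum_{a,b,n} T[w][a][b][n] * T[w][b][a][n],
    // T stored as [nwalk][nocc][nocc][nchol] (wabn layout).
    void dot_wabn_ref(int nwalk, int nocc, int nchol, std::complex<double> alpha,
                      const std::vector<std::complex<double>>& T,
                      std::vector<std::complex<double>>& y, int incy)
    {
      for (int w = 0; w < nwalk; ++w) {
        std::complex<double> e(0.0);
        for (int a = 0; a < nocc; ++a)
          for (int b = 0; b < nocc; ++b) {
            const std::complex<double>* A = &T[((w * nocc + a) * nocc + b) * nchol];
            const std::complex<double>* B = &T[((w * nocc + b) * nocc + a) * nchol];
            for (int n = 0; n < nchol; ++n) e += A[n] * B[n];
          }
        y[w * incy] += alpha * e;
      }
    }

kernel_dot_wabn assigns one block per (w,a,b) triple and reduces the n sum through the shared-memory cache, finishing with an atomicAdd on the real and imaginary parts of y[w].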
Morales, moralessilva2@llnl.gov +// Lawrence Livermore National Laboratory +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AFQMC_DOT_WABN_H +#define AFQMC_DOT_WABN_H + +#include +#include +#include "AFQMC/Numerics/detail/CUDA/Kernels/cuda_settings.h" + +namespace kernels +{ + +void dot_wabn( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy); +void dot_wabn( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy); +void dot_wabn( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy); + + +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy); +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy); +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex const alpha, std::complex const* Tab, + std::complex* y, int incy); +} + +#endif diff --git a/src/AFQMC/Numerics/detail/CUDA/blas_cuda_gpu_ptr.hpp b/src/AFQMC/Numerics/detail/CUDA/blas_cuda_gpu_ptr.hpp index b56e248de8..1b14797deb 100644 --- a/src/AFQMC/Numerics/detail/CUDA/blas_cuda_gpu_ptr.hpp +++ b/src/AFQMC/Numerics/detail/CUDA/blas_cuda_gpu_ptr.hpp @@ -99,15 +99,15 @@ namespace qmc_cuda } // GEMV Specializations - template + template inline static void gemv(char Atrans, int M, int N, - T alpha, + T2 alpha, cuda_gpu_ptr A, int lda, cuda_gpu_ptr x, int incx, - T beta, + T2 beta, cuda_gpu_ptr y, int incy) { - static_assert(std::is_same::type,T>::value,"Wrong dispatch.\n"); + static_assert(std::is_same::type,T2>::value,"Wrong dispatch.\n"); static_assert(std::is_same::type,T>::value,"Wrong dispatch.\n"); if(CUBLAS_STATUS_SUCCESS != cublas::cublas_gemv(*A.handles.cublas_handle,Atrans, M,N,alpha,to_address(A),lda,to_address(x),incx, @@ -117,16 +117,16 @@ namespace qmc_cuda // GEMM Specializations // why is this not working with T const???? 
- template + template inline static void gemm(char Atrans, char Btrans, int M, int N, int K, - T alpha, + T2 alpha, cuda_gpu_ptr A, int lda, cuda_gpu_ptr B, int ldb, - T beta, + T2 beta, cuda_gpu_ptr C, int ldc) { static_assert(std::is_same::type,T>::value,"Wrong dispatch.\n"); - static_assert(std::is_same::type,T>::value,"Wrong dispatch.\n"); + static_assert(std::is_same::type,T2>::value,"Wrong dispatch.\n"); if(CUBLAS_STATUS_SUCCESS != cublas::cublas_gemm(*A.handles.cublas_handle,Atrans,Btrans, M,N,K,alpha,to_address(A),lda,to_address(B),ldb,beta,to_address(C),ldc)) throw std::runtime_error("Error: cublas_gemm returned error code."); diff --git a/src/AFQMC/Numerics/detail/CUDA/cublas_wrapper.hpp b/src/AFQMC/Numerics/detail/CUDA/cublas_wrapper.hpp index 57ea60c40e..27f88406ba 100644 --- a/src/AFQMC/Numerics/detail/CUDA/cublas_wrapper.hpp +++ b/src/AFQMC/Numerics/detail/CUDA/cublas_wrapper.hpp @@ -314,6 +314,60 @@ namespace cublas { return sucess; } + inline cublasStatus_t cublas_gemv(cublasHandle_t handle, + char Atrans, int M, int N, + const float alpha, + const float * A, int lda, + const std::complex * x, int incx, + const float beta, + std::complex *y, int incy) + { + cublasStatus_t sucess; + char Nt('N'); + char Tt('T'); + if(Atrans == 'n' || Atrans == 'N') + sucess = cublasSgemm(handle,cublasOperation(Nt),cublasOperation(Tt),2,M,N,&alpha, + reinterpret_cast(x),2*incx, + A,lda,&beta, + reinterpret_cast(y),2*incy); + else if(Atrans == 't' || Atrans == 'T') + sucess = cublasSgemm(handle,cublasOperation(Nt),cublasOperation(Nt),2,N,M,&alpha, + reinterpret_cast(x),2*incx, + A,lda,&beta, + reinterpret_cast(y),2*incy); + else + assert(0); + cudaDeviceSynchronize (); + return sucess; + } + + inline cublasStatus_t cublas_gemv(cublasHandle_t handle, + char Atrans, int M, int N, + const double alpha, + const double * A, int lda, + const std::complex * x, int incx, + const double beta, + std::complex * y, int incy) + { + cublasStatus_t sucess; + char Nt('N'); + char Tt('T'); + if(Atrans == 'n' || Atrans == 'N') + sucess = cublasDgemm(handle,cublasOperation(Nt),cublasOperation(Tt),2,M,N,&alpha, + reinterpret_cast(x),2*incx, + A,lda,&beta, + reinterpret_cast(y),2*incy); + else if(Atrans == 't' || Atrans == 'T') + sucess = cublasDgemm(handle,cublasOperation(Nt),cublasOperation(Nt),2,N,M,&alpha, + reinterpret_cast(x),2*incx, + A,lda,&beta, + reinterpret_cast(y),2*incy); + else + assert(0); + cudaDeviceSynchronize (); + return sucess; + } + // Level-3 inline cublasStatus_t cublas_gemm(cublasHandle_t handle, @@ -388,6 +442,44 @@ namespace cublas { return sucess; } + inline cublasStatus_t cublas_gemm(cublasHandle_t handle, + char Atrans, char Btrans, int M, int N, int K, + const float alpha, + const std::complex * A, int lda, + const float * B, int ldb, + const float beta, + std::complex * C, int ldc) + { + assert(Atrans=='n' || Atrans=='N'); + cublasStatus_t sucess = + cublasSgemm(handle, + cublasOperation(Atrans),cublasOperation(Btrans),2*M,N,K,&alpha, + reinterpret_cast(A),2*lda, + B,ldb,&beta, + reinterpret_cast(C),2*ldc); + cudaDeviceSynchronize (); + return sucess; + } + + inline cublasStatus_t cublas_gemm(cublasHandle_t handle, + char Atrans, char Btrans, int M, int N, int K, + double alpha, + const std::complex * A, int lda, + double * B, int ldb, + double beta, + std::complex * C, int ldc) + { + assert(Atrans=='n' || Atrans=='N'); + cublasStatus_t sucess = + cublasDgemm(handle, + cublasOperation(Atrans),cublasOperation(Btrans),2*M,N,K,&alpha, + reinterpret_cast(A),2*lda, + B,ldb,&beta, + 
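The new cublas_gemv and cublas_gemm overloads below apply a real matrix to complex data by exploiting the layout of std::complex<T> as two contiguous reals: the complex vector x is reinterpreted as a 2 x N real matrix whose columns hold (Re x_j, Im x_j), and a single real GEMM acts on both components at once (the blas_cpu.hpp change earlier in this diff corrects the m/n ordering in the transposed branch of the same trick). A small self-contained check of the equivalence, with the GEMM written out as plain loops (a sketch; not the cuBLAS call itself):

    #include <cassert>
    #include <complex>
    #include <vector>

    int main() {
      const int M = 2, N = 3;
      std::vector<double> A = {1, 2, 3,          // real M x N matrix, row-major
                               4, 5, 6};
      std::vector<std::complex<double>> x = {{1, -1}, {0, 2}, {3, 0.5}};

      // Reference: y = A * x in complex arithmetic.
      std::vector<std::complex<double>> y_ref(M, 0.0);
      for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j) y_ref[i] += A[i * N + j] * x[j];

      // Same result via a real "GEMM": view x as a 2 x N real matrix X[c][j]
      // (c = 0 real part, c = 1 imaginary part), form Y(2 x M) = X * A^T, and
      // read Y's columns back as the complex result y.
      const double* X = reinterpret_cast<const double*>(x.data());  // X[c + 2*j]
      std::vector<std::complex<double>> y(M, 0.0);
      double* Y = reinterpret_cast<double*>(y.data());              // Y[c + 2*i]
      for (int c = 0; c < 2; ++c)
        for (int i = 0; i < M; ++i)
          for (int j = 0; j < N; ++j) Y[c + 2 * i] += X[c + 2 * j] * A[i * N + j];

      for (int i = 0; i < M; ++i) assert(std::abs(y[i] - y_ref[i]) < 1e-12);
      return 0;
    }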
reinterpret_cast(C),2*ldc); + cudaDeviceSynchronize (); + return sucess; + } + inline cublasStatus_t cublas_gemm(cublasHandle_t handle, char Atrans, char Btrans, int M, int N, int K, const cuDoubleComplex alpha, diff --git a/src/AFQMC/Numerics/helpers/batched_operations.hpp b/src/AFQMC/Numerics/helpers/batched_operations.hpp index c97ccbcb3c..8c2c5c9997 100644 --- a/src/AFQMC/Numerics/helpers/batched_operations.hpp +++ b/src/AFQMC/Numerics/helpers/batched_operations.hpp @@ -19,7 +19,9 @@ #if defined(ENABLE_CUDA) #include "AFQMC/Memory/CUDA/cuda_gpu_pointer.hpp" #include "AFQMC/Numerics/detail/CUDA/Kernels/batched_dot_wabn_wban.cuh" +#include "AFQMC/Numerics/detail/CUDA/Kernels/dot_wabn.cuh" #include "AFQMC/Numerics/detail/CUDA/Kernels/batched_Tab_to_Klr.cuh" +#include "AFQMC/Numerics/detail/CUDA/Kernels/Tab_to_Kl.cuh" #include "AFQMC/Numerics/detail/CUDA/Kernels/vbias_from_v1.cuh" #endif @@ -47,6 +49,60 @@ void batched_dot_wabn_wban( int nbatch, int nwalk, int nocc, int nchol, } } +template +void batched_dot_wanb_wbna( int nbatch, int nwalk, int nocc, int nchol, + std::complex const* alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int nocc2nc = nocc*nocc*nchol; + for(int batch=0; batch E_(0.0); + auto A_(Tab + (2*batch*nwalk+w)*nocc2nc); + auto B_(Tab + ((2*batch+1)*nwalk+w)*nocc2nc); + using ma::dot; + for(int a=0; a>(alpha[batch]*E_); + } + } +} + +template +void dot_wabn( int nwalk, int nocc, int nchol, + std::complex alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int nocc2nc = nocc*nocc*nchol; + for(int w=0; w E_(0.0); + auto A_(Tab + w*nocc2nc); + using ma::dot; + for(int a=0; a>(alpha*E_); + } +} + +template +void dot_wanb( int nwalk, int nocc, int nchol, + std::complex alpha, std::complex const* Tab, + std::complex* y, int incy) +{ + int nocc2nc = nocc*nchol*nocc; + for(int w=0; w E_(0.0); + auto A_(Tab + w*nocc2nc); + using ma::dot; + for(int a=0; a>(alpha*E_); + } +} + template void batched_Tab_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, @@ -77,6 +133,61 @@ void batched_Tab_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, } } +template +void batched_Tanb_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, + int nchol_tot, int ncholQ, int ncholQ0, int* kdiag, + Q const* Tab, T* Kl, T* Kr) +{ + for(int w=0; w(Tba_[c*nocc]); + } + } + for( int k=0; k(Tab_[c*nocc]); + } + } + } +} + +template +void Tab_to_Kl(int nwalk, int nocc, int nchol, Q const* Tab, T* Kl) +{ + for(int w=0; w(Tab_[c]); + } + } +} + +template +void Tanb_to_Kl(int nwalk, int nocc, int nchol, int nchol_tot, Q const* Tab, T* Kl) +{ + for(int w=0; w(Tab_[c*nocc]); + } + } +} + template void vbias_from_v1( int nwalk, int nkpts, int nchol_max, int* Qsym, int* kminus, int* ncholpQ, int* ncholpQ0, std::complex const alpha, @@ -205,6 +316,31 @@ void batched_Tab_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, to_address(Kl), to_address(Kr)); } +template +void batched_Tanb_to_Klr(int nterms, int nwalk, int nocc, int nchol_max, + int nchol_tot, int ncholQ, int ncholQ0, cuda_gpu_ptr kdiag, + cuda_gpu_ptr Tab, cuda_gpu_ptr Kl, + cuda_gpu_ptr Kr) +{ + kernels::batched_Tanb_to_Klr(nterms,nwalk,nocc,nchol_max,nchol_tot,ncholQ,ncholQ0, + to_address(kdiag), to_address(Tab), + to_address(Kl), to_address(Kr)); +} + +template +void Tab_to_Kl(int nwalk, int nocc, int nchol, + cuda_gpu_ptr Tab, cuda_gpu_ptr Kl) +{ + kernels::Tab_to_Kl(nwalk,nocc,nchol,to_address(Tab),to_address(Kl)); +} + +template +void Tanb_to_Kl(int nwalk, int nocc, int nchol, int nchol_tot, + cuda_gpu_ptr 
Tab, cuda_gpu_ptr Kl) +{ + kernels::Tanb_to_Kl(nwalk,nocc,nchol,nchol_tot,to_address(Tab),to_address(Kl)); +} + template void batched_dot_wabn_wban( int nbatch, int nwalk, int nocc, int nchol, cuda_gpu_ptr alpha, cuda_gpu_ptr Tab, @@ -212,7 +348,29 @@ void batched_dot_wabn_wban( int nbatch, int nwalk, int nocc, int nchol, { kernels::batched_dot_wabn_wban(nbatch,nwalk,nocc,nchol,to_address(alpha),to_address(Tab), y,incy); +} +template +void batched_dot_wanb_wbna( int nbatch, int nwalk, int nocc, int nchol, + cuda_gpu_ptr alpha, cuda_gpu_ptr Tab, + T* y , int incy) +{ + kernels::batched_dot_wanb_wbna(nbatch,nwalk,nocc,nchol,to_address(alpha),to_address(Tab), + y,incy); +} + +template +void dot_wabn( int nwalk, int nocc, int nchol, R alpha, cuda_gpu_ptr Tab, + T* y , int incy) +{ + kernels::dot_wabn(nwalk,nocc,nchol,alpha,to_address(Tab),y,incy); +} + +template +void dot_wanb( int nwalk, int nocc, int nchol, R alpha, cuda_gpu_ptr Tab, + T* y , int incy) +{ + kernels::dot_wanb(nwalk,nocc,nchol,alpha,to_address(Tab),y,incy); } template diff --git a/src/AFQMC/Propagators/AFQMCDistributedPropagatorDistCV.icc b/src/AFQMC/Propagators/AFQMCDistributedPropagatorDistCV.icc index 680276c58e..628eb08806 100644 --- a/src/AFQMC/Propagators/AFQMCDistributedPropagatorDistCV.icc +++ b/src/AFQMC/Propagators/AFQMCDistributedPropagatorDistCV.icc @@ -444,13 +444,16 @@ void AFQMCDistributedPropagatorDistCV::step_collective(int nsteps_, WlkSet& wset int vak0,vakN; int Gak0,GakN; +// NOTE: keep vHS, vbias and G as single-precision numbers; +// in the mixed-precision case there is no need to keep them here in double precision. +// Cast to/from double precision with inplace_cast when needed. size_t displ=0; // HS potential for communication CMatrix_ref vrecv(mem_pool.origin(), {vhs_nr,vhs_nc}); displ+=vrecv.num_elements(); // second view of vHS matrix for use in propagation step // notice that the final vHS matrix will be in vrecv, received in the last step - C3Tensor_ref vHS3D(mem_pool.origin()+displ,{vhs3d_n1,vhs3d_n2,vhs3d_n3}); + C3Tensor_ref vHS3D(vrecv.origin(),{vhs3d_n1,vhs3d_n2,vhs3d_n3}); // scope controlling lifetime of temporary arrays { @@ -540,25 +543,16 @@ void AFQMCDistributedPropagatorDistCV::step_collective(int nsteps_, WlkSet& wset // 2.
bcast G AFQMCTimers[vHS_comm_overhead_timer]->start(); #ifdef BUILD_AFQMC_WITH_NCCL - if(k == TG.getLocalTGRank()) { + if(k == TG.getLocalTGRank()) copy_n(Gstore.origin()+Gak0,GakN-Gak0,Gwork.origin()+Gak0); - NCCLCHECK(ncclBcast(to_address(Gwork.origin())+Gak0,GakN-Gak0,ncclDouble,k, - TG.ncclTG(),TG.ncclStream())); - qmc_cuda::cuda_check(cudaStreamSynchronize(TG.ncclStream()),"cudaStreamSynchronize(s)"); - //TG.TG_Cores().broadcast_n(to_address(Gwork.origin())+Gak0,GakN-Gak0,k); - } else { - NCCLCHECK(ncclBcast(to_address(Gwork.origin())+Gak0,GakN-Gak0,ncclDouble,k, - TG.ncclTG(),TG.ncclStream())); - qmc_cuda::cuda_check(cudaStreamSynchronize(TG.ncclStream()),"cudaStreamSynchronize(s)"); - //TG.TG_Cores().broadcast_n(to_address(Gwork.origin())+Gak0,GakN-Gak0,k); - } + NCCLCHECK(ncclBcast(to_address(Gwork.origin())+Gak0,2*(GakN-Gak0),ncclDouble,k, + TG.ncclTG(),TG.ncclStream())); + qmc_cuda::cuda_check(cudaStreamSynchronize(TG.ncclStream()),"cudaStreamSynchronize(s)"); + //TG.TG_Cores().broadcast_n(to_address(Gwork.origin())+Gak0,GakN-Gak0,k); #else - if(k == TG.getLocalTGRank()) { + if(k == TG.getLocalTGRank()) copy_n(Gstore.origin()+Gak0,GakN-Gak0,Gwork.origin()+Gak0); - TG.TG_Cores().broadcast_n(to_address(Gwork.origin())+Gak0,GakN-Gak0,k); - } else { - TG.TG_Cores().broadcast_n(to_address(Gwork.origin())+Gak0,GakN-Gak0,k); - } + TG.TG_Cores().broadcast_n(to_address(Gwork.origin())+Gak0,GakN-Gak0,k); #endif TG.local_barrier(); AFQMCTimers[vHS_comm_overhead_timer]->stop(); @@ -598,11 +592,9 @@ void AFQMCDistributedPropagatorDistCV::step_collective(int nsteps_, WlkSet& wset AFQMCTimers[vHS_comm_overhead_timer]->start(); // 4. Reduce vHS #ifdef BUILD_AFQMC_WITH_NCCL -// If BUILD_AFQMC_WITH_NCCL, ENABLE_CUDA must be set, which guarantees ncores==1 -// APP_ABORT("FINISH \n\n\n"); - NCCLCHECK(ncclAllReduce((const void*)to_address(MFfactor.origin()), - (void*)to_address(MFfactor.origin()), 2*MFfactor.num_elements(), - ncclDouble, ncclSum,TG.ncclTG(),TG.ncclStream())); + NCCLCHECK(ncclReduce((const void*)to_address(vHS.origin())+vak0, + (void*)to_address(vrecv.origin())+vak0, 2*(vakN-vak0), + ncclDouble, ncclSum,k,TG.ncclTG(),TG.ncclStream())); qmc_cuda::cuda_check(cudaStreamSynchronize(TG.ncclStream()),"cudaStreamSynchronize(s)"); #else TG.TG_Cores().reduce_n(to_address(vHS.origin())+vak0,vakN-vak0, diff --git a/src/AFQMC/Utilities/readWfn.cpp b/src/AFQMC/Utilities/readWfn.cpp index fe1495cfc4..5b2e2cc82a 100644 --- a/src/AFQMC/Utilities/readWfn.cpp +++ b/src/AFQMC/Utilities/readWfn.cpp @@ -605,7 +605,10 @@ void read_ph_wavefunction_hdf(hdf_archive& dump, std::vector& ci_co * NOTE: Does not mean perfect pairing, means excitations from a single reference * - 1: excitations out of a UHF reference */ - getCommonInput(dump, NMO, NAEA, NAEB, ndets, ci_coeff, walker_type, comm.root()); + WALKER_TYPES wtype; + getCommonInput(dump, NMO, NAEA, NAEB, ndets, ci_coeff, wtype, comm.root()); + if(walker_type != wtype && NAEA != NAEB) + APP_ABORT(" Error: Different walker_type between hdf5 and xml inputs is not allowed when NAEA!=NAEB. \n"); if(walker_type != COLLINEAR) APP_ABORT(" Error: walker_type!=COLLINEAR not yet implemented in read_ph_wavefunction.\n"); @@ -632,10 +635,17 @@ void read_ph_wavefunction_hdf(hdf_archive& dump, std::vector& ci_co PsiT.emplace_back(csr_hdf5::HDF2CSR(dump,comm)); dump.pop(); if(wfn_type == 1) { - if(!dump.push(std::string("PsiT_")+std::to_string(1),false)) { - app_error()<<" Error in WavefunctionFactory: Group PsiT not found. 
\n"; - APP_ABORT(""); - } + if(wtype == CLOSED) { + if(!dump.push(std::string("PsiT_")+std::to_string(0),false)) { + app_error()<<" Error in WavefunctionFactory: Group PsiT not found. \n"; + APP_ABORT(""); + } + } else if(wtype == COLLINEAR) { + if(!dump.push(std::string("PsiT_")+std::to_string(1),false)) { + app_error()<<" Error in WavefunctionFactory: Group PsiT not found. \n"; + APP_ABORT(""); + } + } PsiT.emplace_back(csr_hdf5::HDF2CSR(dump,comm)); dump.pop(); } @@ -768,7 +778,7 @@ ph_excitations build_ph_struct(std::vector ci_coef * Read trial wavefunction information from file. */ void getCommonInput(hdf_archive& dump, int NMO, int NAEA, int NAEB, int& ndets_to_read, - std::vector& ci, WALKER_TYPES walker_type, bool root) + std::vector& ci, WALKER_TYPES& walker_type, bool root) { // check for consistency in parameters std::vector dims(5); @@ -788,10 +798,12 @@ void getCommonInput(hdf_archive& dump, int NMO, int NAEA, int NAEB, int& ndets_t app_error()<<" Error in getCommonInput(): Inconsistent NAEB. \n"; APP_ABORT(""); } - if(walker_type != dims[3]) { - app_error()<<" Error in getCommonInput(): Inconsistent walker_type. \n"; - APP_ABORT(""); - } + walker_type = afqmc::initWALKER_TYPES(dims[3]); + // just read walker_type, to allow flexibility + //if(walker_type != dims[3]) { + // app_error()<<" Error in getCommonInput(): Inconsistent walker_type. \n"; + // APP_ABORT(""); + //} if(ndets_to_read < 1) ndets_to_read = dims[4]; app_log() << " - Number of determinants in trial wavefunction: " << ndets_to_read << "\n"; if(ndets_to_read > dims[4]) { diff --git a/src/AFQMC/Utilities/readWfn.h b/src/AFQMC/Utilities/readWfn.h index a7585f080e..17d126f0fc 100644 --- a/src/AFQMC/Utilities/readWfn.h +++ b/src/AFQMC/Utilities/readWfn.h @@ -44,7 +44,7 @@ ph_excitations build_ph_struct(std::vector ci_coef boost::mpi3::shared_communicator& comm, int NMO, int NAEA, int NAEB); void getCommonInput(hdf_archive& dump, int NMO, int NAEA, int NAEB, int& ndets_to_read, - std::vector& ci, WALKER_TYPES walker_type, bool root); + std::vector& ci, WALKER_TYPES& walker_type, bool root); WALKER_TYPES getWalkerType(std::string filename); WALKER_TYPES getWalkerTypeHDF5(std::string filename, std::string type); diff --git a/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp b/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp index a0f8a29ff1..9e044209df 100644 --- a/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp +++ b/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp @@ -664,8 +664,9 @@ Wavefunction WavefunctionFactory::fromHDF5(TaskGroup_& TGprop, TaskGroup_& TGwfn std::vector ci; // Read common trial wavefunction input options. + WALKER_TYPES input_wtype; getCommonInput(dump, NMO, NAEA, NAEB, ndets_to_read, ci, - walker_type, TGwfn.Global().root()); + input_wtype, TGwfn.Global().root()); if(restart_file == "") { NCE = h.getNuclearCoulombEnergy(); } else { @@ -693,6 +694,19 @@ Wavefunction WavefunctionFactory::fromHDF5(TaskGroup_& TGprop, TaskGroup_& TGwfn PsiT.emplace_back(csr_hdf5::HDF2CSR(dump,TGwfn.Node())); //,Alloc(TGwfn.Node()))); dump.pop(); } + if( walker_type==COLLINEAR and input_wtype==CLOSED) { + if(NAEA!=NAEB) + APP_ABORT(" Error: NAEA!=NAEB when initializing collinear wfn from closed shell file.\n"); + // read them again + for(int i=0; i(dump,TGwfn.Node())); //,Alloc(TGwfn.Node()))); + dump.pop(); + } + } // Set initial walker's Slater matrix. 
getInitialGuess(dump, name, NMO, NAEA, NAEB, walker_type); @@ -917,6 +931,12 @@ Wavefunction WavefunctionFactory::fromHDF5(TaskGroup_& TGprop, TaskGroup_& TGwfn */ void WavefunctionFactory::getInitialGuess(hdf_archive& dump, std::string& name, int NMO, int NAEA, int NAEB, WALKER_TYPES walker_type) { + std::vector dims(5); + if(!dump.readEntry(dims,"dims")) { + app_error()<<" Error in getInitialGuess(): Problems reading dims. \n"; + APP_ABORT(""); + } + WALKER_TYPES wtype(initWALKER_TYPES(dims[3])); auto guess = initial_guess.find(name); if( guess == initial_guess.end() ) { auto newg = initial_guess.insert( @@ -936,14 +956,27 @@ void WavefunctionFactory::getInitialGuess(hdf_archive& dump, std::string& name, ((newg.first)->second)[0][i][j] = Psi0Alpha[i][j]; } if(walker_type==COLLINEAR) { - boost::multi::array Psi0Beta({NMO,NAEB}); - if(!dump.readEntry(Psi0Beta, "Psi0_beta")) { - app_error()<<" Error in WavefunctionFactory: Initial wavefunction Psi0_beta not found. \n"; - APP_ABORT(""); - } - for(int i = 0; i < NMO; i++) - for(int j = 0; j < NAEB; j++) - ((newg.first)->second)[1][i][j] = Psi0Beta[i][j]; + if(wtype == COLLINEAR) { + boost::multi::array Psi0Beta({NMO,NAEB}); + if(!dump.readEntry(Psi0Beta, "Psi0_beta")) { + app_error()<<" Error in WavefunctionFactory: Initial wavefunction Psi0_beta not found. \n"; + APP_ABORT(""); + } + for(int i = 0; i < NMO; i++) + for(int j = 0; j < NAEB; j++) + ((newg.first)->second)[1][i][j] = Psi0Beta[i][j]; + } else if(wtype == CLOSED) { + boost::multi::array Psi0Beta({NMO,NAEA}); + assert(NAEA==NAEB); + if(!dump.readEntry(Psi0Beta, "Psi0_alpha")) { + app_error()<<" Error in WavefunctionFactory: Initial wavefunction Psi0_alpha not found. \n"; + APP_ABORT(""); + } + for(int i = 0; i < NMO; i++) + for(int j = 0; j < NAEB; j++) + ((newg.first)->second)[1][i][j] = Psi0Beta[i][j]; + } else + APP_ABORT(" Error: Unknown wtype. \n"); } } else APP_ABORT(" Error: Problems adding new initial guess, already exists. \n"); diff --git a/src/AFQMC/config.h b/src/AFQMC/config.h index e35598c8f3..4b2bb792b2 100644 --- a/src/AFQMC/config.h +++ b/src/AFQMC/config.h @@ -71,6 +71,14 @@ namespace afqmc // when ENABLE_CUDA is not set, DEVICE and TG_LOCAL are the same enum ALLOCATOR_TYPES {STD,NODE,STD_DEVICE,SHARED_LOCAL_DEVICE,SHARED_DEVICE}; + inline WALKER_TYPES initWALKER_TYPES(int i) { + if(i==0) return UNDEFINED_WALKER_TYPE; + else if(i==1) return CLOSED; + else if(i==2) return COLLINEAR; + else if(i==3) return NONCOLLINEAR; + return UNDEFINED_WALKER_TYPE; + } + template using s1D = std::tuple; template using s2D = std::tuple; template using s3D = std::tuple; @@ -192,7 +200,7 @@ namespace afqmc ma::sparse::is_root>; #endif - enum HamiltonianTypes {Factorized,THC,KPTHC,KPFactorized,UNKNOWN}; + enum HamiltonianTypes {Factorized,THC,KPTHC,KPFactorized,RealDenseFactorized,UNKNOWN}; template using iextensions = typename boost::multi::iextensions;
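Reviewer note on the config.h change above: initWALKER_TYPES maps the integer stored in the wavefunction file's dims[3] entry onto the WALKER_TYPES enum, which is what lets getCommonInput report the file's walker type instead of aborting on a mismatch with the XML input. Below is a self-contained sketch of that mapping and its intended use; the enum definition, the assumed dims ordering, and the main() driver are illustrative stand-ins, not taken from the patch.

// Standalone sketch of the dims[3] -> WALKER_TYPES mapping (enum and driver are assumptions).
#include <cassert>
#include <vector>

enum WALKER_TYPES {UNDEFINED_WALKER_TYPE, CLOSED, COLLINEAR, NONCOLLINEAR};

inline WALKER_TYPES initWALKER_TYPES(int i) {
  if(i==1) return CLOSED;
  else if(i==2) return COLLINEAR;
  else if(i==3) return NONCOLLINEAR;
  return UNDEFINED_WALKER_TYPE;          // i==0 or any unexpected value
}

int main() {
  // Hypothetical "dims" record from the HDF5 wavefunction file; dims[3] carries the
  // walker type and dims[4] the determinant count, as used in getCommonInput.
  std::vector<int> dims = {10, 4, 4, 2, 1};
  WALKER_TYPES wtype = initWALKER_TYPES(dims[3]);
  assert(wtype == COLLINEAR);
  // A COLLINEAR run may still start from a CLOSED-shell file when NAEA==NAEB,
  // which is the case the WavefunctionFactory changes above now allow.
  return 0;
}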