QMCPACK · mmorale3 · Nov 14, 2019 · Sep 16, 2019 · Sep 16, 2019 · Sep 16, 2019
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -227,6 +227,7 @@ IF(MIXED_PRECISION AND BUILD_LMYENGINE_INTERFACE)
   SET(BUILD_LMYENGINE_INTERFACE 0)
 ENDIF()
 SET(BUILD_AFQMC 0 CACHE BOOL "Build with AFQMC")
+SET(BUILD_AFQMC_WITH_NCCL 0 CACHE BOOL "Build AFQMC with NCCL library.")
 # AFQMC requires MPI.
 If (BUILD_AFQMC AND NOT QMC_MPI)
   MESSAGE(FATAL_ERROR "AFQMC requires building with MPI (QMC_MPI=1). Set BUILD_AFQMC=0 or configure MPI.")

diff --git a/external_codes/mpi_wrapper/mpi3/shared_window.hpp b/external_codes/mpi_wrapper/mpi3/shared_window.hpp
@@ -21,8 +21,8 @@ namespace mpi3{
 template<class T>
 struct shared_window : window<T>{
 //	shared_communicator& comm_;
-	shared_window(shared_communicator& comm, mpi3::size_t n, int disp_unit = alignof(T)) : //sizeof(T)) : // here we assume that disp_unit is used for align
-		window<T>{}//, comm_{comm}
+	shared_window(shared_communicator& comm, mpi3::size_t n, int disp_unit = alignof(T)) //: //sizeof(T)) : // here we assume that disp_unit is used for align
+		//window<T>()//, comm_{comm}
 	{
 		void* base_ptr = nullptr;
 		auto e = static_cast<enum error>(

diff --git a/src/AFQMC/Estimators/BackPropagatedEstimator.hpp b/src/AFQMC/Estimators/BackPropagatedEstimator.hpp
@@ -140,13 +140,15 @@ class BackPropagatedEstimator: public EstimatorBase
     if(iav < 0) return;
 
     using std::fill_n;
-    // 0. skip if requested 
-    if(bp_step == max_nback_prop && iblock < nblocks_skip) {
-      if( iblock+1 == nblocks_skip )
-        for(auto it=wset.begin(); it<wset.end(); ++it)
-          it->setSlaterMatrixN(); 
-      iblock++;
-      wset.setBPPos(0);
+    // 0. skip if requested: nothing is accumulated while iblock < nblocks_skip
+    if(iblock < nblocks_skip) {
+      if(bp_step == max_nback_prop) {
+        if( iblock+1 == nblocks_skip )
+          for(auto it=wset.begin(); it<wset.end(); ++it)
+            it->setSlaterMatrixN(); 
+        iblock++;
+        wset.setBPPos(0);
+      }  
       return;
     }
 

diff --git a/src/AFQMC/Estimators/EstimatorHandler.h b/src/AFQMC/Estimators/EstimatorHandler.h
@@ -61,6 +61,11 @@ class EstimatorHandler: public AFQMCInfo
   {
     estimators.reserve(10);
 
+    app_log()<<"\n****************************************************\n"
+           <<"               Initializing Estimators \n"
+           <<"****************************************************\n"
+           <<std::endl;
+
     std::string overwrite_default_energy("no");
     xmlNodePtr curRoot = cur;
     xmlNodePtr curBasic = NULL;

diff --git a/src/AFQMC/Estimators/FullObsHandler.hpp b/src/AFQMC/Estimators/FullObsHandler.hpp
@@ -62,6 +62,8 @@ class FullObsHandler: public AFQMCInfo
   using sharedCMatrix_ref = boost::multi::array_ref<ComplexType,2,shared_pointer>;
   using sharedC4Tensor_ref = boost::multi::array_ref<ComplexType,4,shared_pointer>;
 
+  using mpi3C4Tensor = boost::multi::array<ComplexType,4,shared_allocator<ComplexType>>;
+
   using stdCVector = boost::multi::array<ComplexType,1>;
   using stdCMatrix = boost::multi::array<ComplexType,2>;
   using stdCVector_ref = boost::multi::array_ref<ComplexType,1>;
@@ -74,7 +76,8 @@ class FullObsHandler: public AFQMCInfo
                                     AFQMCInfo(info),TG(tg_),walker_type(wlk),
                                     wfn0(wfn), writer(false), block_size(1), nave(1),name(name_),
                                     nspins((walker_type==COLLINEAR)?2:1),
-                                    Buff(iextensions<1u>{1},make_localTG_allocator<ComplexType>(TG))
+                                    Buff(iextensions<1u>{1},make_localTG_allocator<ComplexType>(TG)), 
+                                    G4D_host({0,0,0,0},shared_allocator<ComplexType>{TG.TG_local()})
   {
 
     using std::fill_n;
@@ -93,8 +96,29 @@ class FullObsHandler: public AFQMCInfo
     cur = curRoot->children;
     while (cur != NULL) {
       std::string cname((const char*)(cur->name));
-      if(cname =="OneRDM") {
-        properties.emplace_back(Observable(std::move(full1rdm(TG,info,cur,walker_type,nave,block_size))));
+      std::transform(cname.begin(),cname.end(),cname.begin(),(int (*)(int)) tolower);
+      if(cname =="onerdm") {
+        properties.emplace_back(Observable(std::move(full1rdm(TG,info,cur,walker_type,nave,block_size)))); 
+      } else if(cname =="diag2rdm") {
+        properties.emplace_back(Observable(std::move(diagonal2rdm(TG,info,cur,walker_type,nave,block_size)))); 
+      } else if(cname =="n2r" || cname =="ontop2rdm") {
+#if defined(ENABLE_CUDA)
+        std::string str("false");
+        ParameterSet m_param;
+        m_param.add(str, "use_host_memory", "std::string");
+        m_param.put(cur);
+        std::transform(str.begin(),str.end(),str.begin(),(int (*)(int)) tolower);
+        if(str == "false" || str == "no") { 
+          properties.emplace_back(Observable(std::move(n2r<device_allocator<ComplexType>>(
+                  TG,info,cur,walker_type,false,device_allocator<ComplexType>{},
+                  device_allocator<ComplexType>{},nave,block_size)))); 
+        } else 
+#endif
+        {
+          properties.emplace_back(Observable(std::move(n2r<shared_allocator<ComplexType>>(
+                  TG,info,cur,walker_type,true,shared_allocator<ComplexType>{TG.TG_local()},
+                  shared_allocator<ComplexType>{TG.Node()},nave,block_size)))); 
+        }
       }
       cur = cur->next;
     }
@@ -141,6 +165,12 @@ class FullObsHandler: public AFQMCInfo
     sharedCMatrix_ref G2D(Buff.origin(), {nw, dm_size});
     sharedCVector_ref DevOv(G4D.origin()+G4D.num_elements(), {2*nw});
 
+    if(G4D_host.num_elements() != G4D.num_elements()) {
+      G4D_host = std::move(mpi3C4Tensor(G4D.extensions(),
+                                      shared_allocator<ComplexType>{TG.TG_local()}));
+      TG.TG_local().barrier();
+    }
+
     stdCVector Xw(iextensions<1u>{nw});
     stdCVector Ov(iextensions<1u>{2*nw});
     stdCMatrix detR(DevdetR); 
@@ -221,8 +251,17 @@ class FullObsHandler: public AFQMCInfo
           Xw[iw] = CIcoeff * Ov[iw] * detR[iw][iref]; 
       } 
 
+      // MAM: Most of the simpler estimators need G4D in host memory, so a host copy
+      //      (G4D_host) is filled here; each rank of TG_local copies its own slice.
+      TG.TG_local().barrier();
+      int i0,iN;
+      std::tie(i0,iN) = FairDivideBoundary(TG.TG_local().rank(),int(G4D_host.num_elements()),
+                                         TG.TG_local().size());
+      copy_n( make_device_ptr(G4D.origin())+i0, iN-i0, to_address(G4D_host.origin())+i0);
+      TG.TG_local().barrier();
+
       //3. accumulate references 
-      for(auto& v: properties) v.accumulate_reference(iav,iref,G4D,wgt,Xw,Ov,impsamp);
+      for(auto& v: properties) v.accumulate_reference(iav,iref,G4D,G4D_host,wgt,Xw,Ov,impsamp);
 
     }
     //4. accumulate block (normalize and accumulate sum over references)
@@ -258,6 +297,9 @@ class FullObsHandler: public AFQMCInfo
   // buffer space
   sharedCVector Buff;
 
+  // host-memory copy of G4D, allocated with shared_allocator over TG.TG_local()
+  mpi3C4Tensor G4D_host; 
+
   void set_buffer(size_t N) {
     if(Buff.num_elements() < N)
       Buff = std::move(sharedCVector(iextensions<1u>{N},make_localTG_allocator<ComplexType>(TG)));

diff --git a/src/AFQMC/Estimators/Observables/Observable.hpp b/src/AFQMC/Estimators/Observables/Observable.hpp
@@ -20,6 +20,8 @@
 #include "boost/variant.hpp"
 
 #include "AFQMC/Estimators/Observables/full1rdm.hpp"
+#include "AFQMC/Estimators/Observables/diagonal2rdm.hpp"
+#include "AFQMC/Estimators/Observables/n2r.hpp"
 
 namespace qmcplusplus
 {
@@ -58,7 +60,12 @@ class dummy_obs
  * Variant class for observables. 
  * Defines a common interface for all observable classes.
  */
-class Observable: public boost::variant<dummy::dummy_obs,full1rdm> 
+class Observable: public boost::variant<dummy::dummy_obs,full1rdm,diagonal2rdm,
+                                        n2r<shared_allocator<ComplexType>> 
+#if defined(ENABLE_CUDA)
+                                        ,n2r<device_allocator<ComplexType>> 
+#endif
+                                        > 
                                         //,full2rdm,contract1rdm,contract2rdm>
 {
 
@@ -71,6 +78,17 @@ class Observable: public boost::variant<dummy::dummy_obs,full1rdm>
     explicit Observable(full1rdm && other) : variant(std::move(other)) {}
     explicit Observable(full1rdm const& other) = delete;
 
+    explicit Observable(diagonal2rdm && other) : variant(std::move(other)) {}
+    explicit Observable(diagonal2rdm const& other) = delete;
+
+    explicit Observable(n2r<shared_allocator<ComplexType>> && other) : variant(std::move(other)) {}
+    explicit Observable(n2r<shared_allocator<ComplexType>> const& other) = delete;
+
+#if defined(ENABLE_CUDA)
+    explicit Observable(n2r<device_allocator<ComplexType>> && other) : variant(std::move(other)) {}
+    explicit Observable(n2r<device_allocator<ComplexType>> const& other) = delete;
+#endif
+
 /*
     explicit Observable( && other) : variant(std::move(other)) {}
     explicit Observable( const& other) = delete;