From b318dd95293142ea9e2c52df6c40dfd684f90dca Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Sun, 6 Sep 2020 20:18:03 -0700 Subject: [PATCH 01/10] fixed issues with noncollinear in Walkers --- src/AFQMC/Walkers/WalkerIO.hpp | 13 ++++- src/AFQMC/Walkers/WalkerSetBase.h | 8 +-- src/AFQMC/Walkers/WalkerSetBase.icc | 5 ++ src/AFQMC/Walkers/tests/test_sharedwset.cpp | 65 +++++++++++++-------- 4 files changed, 60 insertions(+), 31 deletions(-) diff --git a/src/AFQMC/Walkers/WalkerIO.hpp b/src/AFQMC/Walkers/WalkerIO.hpp index 1132e7229f..6e1e5a64ff 100644 --- a/src/AFQMC/Walkers/WalkerIO.hpp +++ b/src/AFQMC/Walkers/WalkerIO.hpp @@ -201,6 +201,8 @@ bool restartFromHDF5(WalkerSet& wset, } TG.TG_local().broadcast_n(Idata.begin(), Idata.size()); + auto walker_type = wset.getWalkerType(); + int nWtot = Idata[0]; int wlk_nterms = Idata[2]; int NMO = Idata[4]; @@ -232,9 +234,10 @@ bool restartFromHDF5(WalkerSet& wset, int nw_local = nWN - nW0; { // to limit scope boost::multi::array PsiA, PsiB; + int NMO2 = ((walker_type == NONCOLLINEAR)?2*NMO:NMO); if (TG.TG_local().root()) { - PsiA.reextent({NMO, NAEA}); + PsiA.reextent({NMO2, NAEA}); if (wset.getWalkerType() == COLLINEAR) PsiB.reextent({NMO, NAEB}); } @@ -327,6 +330,8 @@ bool dumpToHDF5(WalkerSet& wset, hdf_archive& dump) NAEA = (*w.SlaterMatrix(Alpha)).size(1); if (walker_type == COLLINEAR) NAEB = (*w.SlaterMatrix(Beta)).size(1) + if (walker_type == NONCOLLINEAR) + NMO /= 2; } std::vector Idata(7); @@ -377,8 +382,10 @@ bool dumpToHDF5(WalkerSet& wset, hdf_archive& dump) auto w = wset[0]; NMO = (*w.SlaterMatrix(Alpha)).size(0); NAEA = (*w.SlaterMatrix(Alpha)).size(1); - if (walker_type == COLLINEAR) - NAEB = (*w.SlaterMatrix(Beta)).size(1); + if (walker_type == COLLINEAR) + NAEB = (*w.SlaterMatrix(Beta)).size(1); + if (walker_type == NONCOLLINEAR) + NMO /= 2; } std::vector Idata(7); diff --git a/src/AFQMC/Walkers/WalkerSetBase.h b/src/AFQMC/Walkers/WalkerSetBase.h index 7d36fc14f2..0f025a1df6 100644 --- a/src/AFQMC/Walkers/WalkerSetBase.h +++ b/src/AFQMC/Walkers/WalkerSetBase.h @@ -615,12 +615,10 @@ class WalkerSetBase : public AFQMCInfo int walkerSizeIO() { - if (walkerType == CLOSED) - return wlk_desc[0] * wlk_desc[1] + 7; - else if (walkerType == COLLINEAR) + if (walkerType == COLLINEAR) return wlk_desc[0] * (wlk_desc[1] + wlk_desc[2]) + 7; - else if (walkerType == NONCOLLINEAR) - return 2 * wlk_desc[0] * (wlk_desc[1] + wlk_desc[2]) + 7; + else // since NAEB = 0 in both CLOSED and NONCOLLINEAR cases + return wlk_desc[0] * wlk_desc[1] + 7; return 0; } diff --git a/src/AFQMC/Walkers/WalkerSetBase.icc b/src/AFQMC/Walkers/WalkerSetBase.icc index 22c84007be..70bdff1e35 100644 --- a/src/AFQMC/Walkers/WalkerSetBase.icc +++ b/src/AFQMC/Walkers/WalkerSetBase.icc @@ -58,6 +58,11 @@ void WalkerSetBase::parse(xmlNodePtr cur) app_log() << " Using a non-collinear (GHF) walker. \n"; walkerType = NONCOLLINEAR; } + else if (type.find("noncollinear") != std::string::npos) + { + app_log() << " Using a non-collinear (GHF) walker. \n"; + walkerType = NONCOLLINEAR; + } else if (type.find("collinear") != std::string::npos) { app_log() << " Using a collinear (UHF/ROHF) walker. 
\n"; diff --git a/src/AFQMC/Walkers/tests/test_sharedwset.cpp b/src/AFQMC/Walkers/tests/test_sharedwset.cpp index 75f0e3d270..c82e77dd20 100644 --- a/src/AFQMC/Walkers/tests/test_sharedwset.cpp +++ b/src/AFQMC/Walkers/tests/test_sharedwset.cpp @@ -78,7 +78,7 @@ void check(M1&& A, M2& B) using namespace afqmc; using communicator = boost::mpi3::communicator; -void test_basic_walker_features(bool serial) +void test_basic_walker_features(bool serial, std::string wtype) { auto world = boost::mpi3::environment::get_world_instance(); auto node = world.split_shared(world.rank()); @@ -92,6 +92,7 @@ void test_basic_walker_features(bool serial) //assert(world.size()%2 == 0); int NMO = 8, NAEA = 2, NAEB = 2, nwalkers = 10; + if(wtype=="noncollinear") { NAEA = 4; NAEB = 0; } //auto node = world.split_shared(); @@ -102,24 +103,24 @@ void test_basic_walker_features(bool serial) info.NAEA = NAEA; info.NAEB = NAEB; info.name = "walker"; - boost::multi::array initA({NMO, NAEA}); - boost::multi::array initB({NMO, NAEB}); - for (int i = 0; i < NAEA; i++) - initA[i][i] = Type(0.22); - for (int i = 0; i < NAEB; i++) - initB[i][i] = Type(0.22); + int M( (wtype=="noncollinear")?2*NMO:NMO ); + boost::multi::array initA({M,NAEA}); + boost::multi::array initB({M,NAEB}); + for(int i=0; i \ + std::string xml_block; + xml_block = " \ 0.05 \ 4 \ - closed \ + "+wtype+" \ async \ pair \ \ "; Libxml2Document doc; - bool okay = doc.parseFromString(xml_block); + bool okay = doc.parseFromString(xml_block.c_str()); REQUIRE(okay); WalkerSet wset(TG, doc.getRoot(), info, &rng); @@ -277,6 +278,9 @@ void test_hyperslab() REQUIRE(imag(DataIn[i][j]) == 0); } } + world.barrier(); + if (world.root()) + remove("dummy_walkers.h5"); } void test_double_hyperslab() @@ -354,9 +358,12 @@ void test_double_hyperslab() */ } } + world.barrier(); + if (world.root()) + remove("dummy_walkers.h5"); } -void test_walker_io() +void test_walker_io(std::string wtype) { auto world = boost::mpi3::environment::get_world_instance(); auto node = world.split_shared(world.rank()); @@ -370,6 +377,7 @@ void test_walker_io() //assert(world.size()%2 == 0); int NMO = 8, NAEA = 2, NAEB = 2, nwalkers = 10; + if(wtype=="noncollinear") { NAEA = 4; NAEB = 0; } //auto node = world.split_shared(); @@ -380,20 +388,20 @@ void test_walker_io() info.NAEA = NAEA; info.NAEB = NAEB; info.name = "walker"; - boost::multi::array initA({NMO, NAEA}); - boost::multi::array initB({NMO, NAEB}); - for (int i = 0; i < NAEA; i++) - initA[i][i] = Type(1.0); - for (int i = 0; i < NAEB; i++) - initB[i][i] = Type(1.0); + int M( (wtype=="noncollinear")?2*NMO:NMO ); + boost::multi::array initA({M,NAEA}); + boost::multi::array initB({M,NAEB}); + for(int i=0; i \ - closed \ + std::string xml_block; + xml_block = " \ + "+wtype+" \ \ "; Libxml2Document doc; - bool okay = doc.parseFromString(xml_block); + bool okay = doc.parseFromString(xml_block.c_str()); REQUIRE(okay); WalkerSet wset(TG, doc.getRoot(), info, &rng); @@ -467,12 +475,19 @@ void test_walker_io() REQUIRE(*wset[i].EJ() == *wset2[i].EJ()); } } + world.barrier(); + if (world.root()) + remove("dummy_walkers.h5"); } TEST_CASE("swset_test_serial", "[shared_wset]") { - test_basic_walker_features(true); - test_basic_walker_features(false); + test_basic_walker_features(true,"closed"); + test_basic_walker_features(false,"closed"); + test_basic_walker_features(true,"collinear"); + test_basic_walker_features(false,"collinear"); + test_basic_walker_features(true,"noncollinear"); + test_basic_walker_features(false,"noncollinear"); } /* 
TEST_CASE("hyperslab_tests", "[shared_wset]") @@ -481,6 +496,10 @@ TEST_CASE("hyperslab_tests", "[shared_wset]") test_double_hyperslab(); } */ -TEST_CASE("walker_io", "[shared_wset]") { test_walker_io(); } +TEST_CASE("walker_io", "[shared_wset]") { + test_walker_io("closed"); + test_walker_io("collinear"); + test_walker_io("noncollinear"); +} } // namespace qmcplusplus From d94f74be89cbbb456b14ec9b7648d33b170ac7f0 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Sun, 6 Sep 2020 21:14:19 -0700 Subject: [PATCH 02/10] changes to readWfn --- src/AFQMC/Utilities/readWfn.cpp | 48 ++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/AFQMC/Utilities/readWfn.cpp b/src/AFQMC/Utilities/readWfn.cpp index 14428fcfc1..466406b732 100644 --- a/src/AFQMC/Utilities/readWfn.cpp +++ b/src/AFQMC/Utilities/readWfn.cpp @@ -73,8 +73,6 @@ void read_header(std::ifstream& in, std::string& type, int& wfn_type, bool& full } case 2: { app_log() << "Reading a GHF-type trial wave-function. \n"; - app_error() << " GHF type not implemented. \n"; - APP_ABORT(" GHF type not implemented in readWfn.. \n"); break; } default: { @@ -353,15 +351,34 @@ void read_general_wavefunction(std::ifstream& in, APP_ABORT(" Error: Expecting Determinant: # tag in wavefunction file. \n"); read_mat(in, OrbMat, Cstyle, fullMOMat, NMO, NAEA); } - PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input(OrbMat, 1e-8, 'H', comm)); - if (walker_type == COLLINEAR) - PsiT.emplace_back( - csr::shm::construct_csr_matrix_single_input(OrbMat(OrbMat.extension(0), {0, NAEB}), 1e-8, - 'H', comm)); + if(walker_type==CLOSED) { + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( + OrbMat,1e-8,'H',comm)); + } else if(walker_type==COLLINEAR) { + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( + OrbMat,1e-8,'H',comm)); + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( + OrbMat(OrbMat.extension(0),{0,NAEB}), + 1e-8,'H',comm)); + } else if(walker_type==NONCOLLINEAR) { + APP_ABORT(" Error in readWfn: wfn_type==closed with walker_type=noncollinear.\n"); +/* + boost::multi::array Mat({2*NMO,NAEA+NAEB}); + if(comm.rank()==0) { + std::fill_n(Mat.origin(),2*NMO*(NAEA+NAEB),ComplexType(0.0)); + Mat({0,NMO},{0,NAEA}) = OrbMat; + Mat({NMO,2*NMO},{NAEA,NAEA+NAEB}) = OrbMat({0,NMO},{0,NAEB}); + } + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( + OrbMat,1e-8,'H',comm)); +*/ + } } } else if (wfn_type == 1) { + if(walker_type != COLLINEAR) + APP_ABORT(" Error in readWfn: wfn_type==collinear with walker_type!=collinear.\n"); boost::multi::array OrbMat({NMO, NAEA}); for (int i = 0, q = 0; i < ndets; i++) { @@ -372,16 +389,21 @@ void read_general_wavefunction(std::ifstream& in, APP_ABORT(" Error: Expecting Determinant: # tag in wavefunction file. 
\n"); read_mat(in, OrbMat, Cstyle, fullMOMat, NMO, NAEA); } + PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input(OrbMat, 1e-8, 'H', comm)); if (comm.rank() == 0) read_mat(in, OrbMat(OrbMat.extension(0), {0, NAEB}), Cstyle, fullMOMat, NMO, NAEB); PsiT.emplace_back( - csr::shm::construct_csr_matrix_single_input(OrbMat(OrbMat.extension(0), {0, NAEB}), 1e-8, 'H', + csr::shm::construct_csr_matrix_single_input(OrbMat(OrbMat.extension(0), {0, NAEB}), 1e-8, 'H', comm)); } } else if (wfn_type == 2) { + if(walker_type != NONCOLLINEAR) + APP_ABORT(" Error in readWfn: wfn_type==collinear with walker_type!=collinear.\n"); + if(NAEB != 0) + APP_ABORT(" Error in readWfn: walker_type==collinear with NAEB!=0.\n"); boost::multi::array OrbMat({2 * NMO, NAEA}); for (int i = 0, q = 0; i < ndets; i++) { @@ -393,13 +415,7 @@ void read_general_wavefunction(std::ifstream& in, read_mat(in, OrbMat, Cstyle, fullMOMat, 2 * NMO, NAEA); } PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input(OrbMat, 1e-8, 'H', comm)); - if (comm.rank() == 0) - read_mat(in, OrbMat(OrbMat.extension(0), {0, NAEB}), Cstyle, fullMOMat, NMO, NAEB); - PsiT.emplace_back( - csr::shm::construct_csr_matrix_single_input(OrbMat(OrbMat.extension(0), {0, NAEB}), 1e-8, 'H', - comm)); } - } //type } else @@ -1020,8 +1036,8 @@ int readWfn(std::string fileName, boost::multi::array& OrbMat, i } else if (wfn_type == 2) { - APP_ABORT(" Error: wfn_type == 2 not implemented. \n"); - + OrbMat.reextent({1,2*NMO,NAEA+NAEB}); + read_mat(in,OrbMat[0],Cstyle,fullMOMat,2*NMO,NAEA+NAEB); } //type in.close(); From 5d8d446252a232090be35dd5c2c114e656d090d7 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Thu, 10 Sep 2020 10:52:38 -0700 Subject: [PATCH 03/10] working commit, making changes to KPFactorizedHamiltonian, not yet done --- .../tests/test_hamiltonian_operations.cpp | 30 ++++------ src/AFQMC/Hamiltonians/HamiltonianFactory.cpp | 8 +++ .../Hamiltonians/KPFactorizedHamiltonian.cpp | 58 ++++++++++--------- src/AFQMC/Utilities/kp_utilities.hpp | 46 ++++++++++----- 4 files changed, 81 insertions(+), 61 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp index d9be303656..554ab201f6 100644 --- a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp +++ b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp @@ -110,7 +110,6 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) Hamiltonian& ham = HamFac.getHamiltonian(gTG, ham_name); using CMatrix = ComplexMatrix; - boost::multi::array OrbMat({2, NMO, NAEA}); hdf_archive dump; if (!dump.open(UTEST_WFN, H5F_ACC_RDONLY)) { @@ -118,7 +117,6 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) } dump.push("Wavefunction", false); dump.push("NOMSD", false); - //int walker_type = readWfn(std::string(UTEST_WFN),OrbMat,NMO,NAEA,NAEB); std::vector dims(5); if (!dump.readEntry(dims, "dims")) { @@ -127,8 +125,8 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) } WALKER_TYPES WTYPE(initWALKER_TYPES(dims[3])); // int walker_type = dims[3]; - dump.push(std::string("PsiT_0")); - int NEL = (WTYPE == CLOSED) ? NAEA : (NAEA + NAEB); + int NEL = (WTYPE == CLOSED) ? NAEA : (NAEA + NAEB); + int NPOL = (WTYPE == NONCOLLINEAR) ? 
2 : 1 ; // WALKER_TYPES WTYPE = CLOSED; // if(walker_type==1) WTYPE = COLLINEAR; // if(walker_type==2) WTYPE = NONCOLLINEAR; @@ -137,32 +135,28 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) Alloc alloc_(make_localTG_allocator(TG)); std::vector PsiT; PsiT.reserve(2); + dump.push(std::string("PsiT_0")); PsiT.emplace_back(csr_hdf5::HDF2CSR>(dump, gTG.Node())); - //PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( - //OrbMat[0],1e-8,'H',gTG.Node())); - if (WTYPE != CLOSED) + if (WTYPE == COLLINEAR) { dump.pop(); dump.push(std::string("PsiT_1")); PsiT.emplace_back(csr_hdf5::HDF2CSR>(dump, gTG.Node())); - //PsiT.emplace_back(csr::shm::construct_csr_matrix_single_input( - //OrbMat[1](OrbMat.extension(1),{0,NAEB}), - //1e-8,'H',gTG.Node())); - //} } dump.pop(); + boost::multi::array OrbMat({2, NPOL*NMO, NAEA}); { - boost::multi::array Psi0A({NMO, NAEA}); + boost::multi::array Psi0A({NPOL*NMO, NAEA}); dump.readEntry(Psi0A, "Psi0_alpha"); - for (int i = 0; i < NMO; i++) + for (int i = 0; i < NPOL*NMO; i++) { for (int j = 0; j < NAEA; j++) { OrbMat[0][i][j] = Psi0A[i][j]; } } - if (WTYPE != CLOSED) + if (WTYPE == COLLINEAR) { boost::multi::array Psi0B({NMO, NAEA}); dump.readEntry(Psi0B, "Psi0_beta"); @@ -183,16 +177,16 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) // NOTE: Make small factory routine! #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) auto SDet( - SlaterDetOperations_serial(NMO, NAEA, + SlaterDetOperations_serial(NPOL*NMO, NAEA, afqmc::device_buffer_generator.get())); #else - auto SDet(SlaterDetOperations_shared(NMO, NAEA)); + auto SDet(SlaterDetOperations_shared(NPOL*NMO, NAEA)); #endif boost::multi::array devOrbMat(OrbMat, alloc_); std::vector devPsiT(move_vector(std::move(PsiT))); - CMatrix G({NEL, NMO}, alloc_); + CMatrix G({NEL, NPOL*NMO}, alloc_); ComplexType Ovlp = SDet.MixedDensityMatrix(devPsiT[0], devOrbMat[0], G.sliced(0, NAEA), 0.0, true); if (WTYPE == COLLINEAR) { @@ -203,7 +197,7 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) REQUIRE(imag(Ovlp) == Approx(0.0)); boost::multi::array Eloc({1, 3}, alloc_); - boost::multi::array_ref Gw(make_device_ptr(G.origin()), {1, NEL * NMO}); + boost::multi::array_ref Gw(make_device_ptr(G.origin()), {1, NEL * NPOL * NMO}); // This assumes {nwalk,...} which is not correct for some HamOps // make this generic HOps.energy(Eloc, Gw, 0, TG.getCoreID() == 0); diff --git a/src/AFQMC/Hamiltonians/HamiltonianFactory.cpp b/src/AFQMC/Hamiltonians/HamiltonianFactory.cpp index f7007003ed..afbe7dfab3 100644 --- a/src/AFQMC/Hamiltonians/HamiltonianFactory.cpp +++ b/src/AFQMC/Hamiltonians/HamiltonianFactory.cpp @@ -174,6 +174,14 @@ Hamiltonian HamiltonianFactory::fromHDF5(GlobalTaskGroup& gTG, xmlNodePtr cur) nkpts = Idata[2]; #endif + // MAM: this is wrong in NONCOLLINEAR, but how do I know what + // walker type it is right here??? + // Might need to read dimensions ahead of time from hdf5 file and check consistensy + // later + // Also, OneBodyHamiltonian doesn't make much sense now that you have KP classes. + // Consider refactoring this part of the code... + // It is not really used now, you can just read H1 in Sparse class too... + // 1 body hamiltonian: Why isn't this in shared memory!!! 
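// A possible follow-up for the NONCOLLINEAR concern above (sketch only: a
// `walker_type` flag is NOT available in this scope yet, which is exactly the
// problem the comment describes):
//   int npol = (walker_type == NONCOLLINEAR) ? 2 : 1;
//   boost::multi::array H1({npol * NMO, npol * NMO});
// i.e. the one-body matrix becomes spinor-sized, matching the npol factors
// introduced in KPFactorizedHamiltonian below.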
boost::multi::array H1({NMO, NMO}); diff --git a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp index 54d8eb8ea1..362f441016 100644 --- a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp +++ b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp @@ -84,6 +84,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b assert(PsiT.size() % 2 == 0); int nspins = ((type != COLLINEAR) ? 1 : 2); int ndet = PsiT.size() / nspins; + int npol = ((type == NONCOLLINEAR) ? 2 : 1); if (ndet > 1) APP_ABORT("Error: ndet > 1 not yet implemented in THCHamiltonian::getHamiltonianOperations.\n"); @@ -228,7 +229,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b int nmo_max = *std::max_element(nmo_per_kp.begin(), nmo_per_kp.end()); int nchol_max = *std::max_element(nchol_per_kp.begin(), nchol_per_kp.end()); - shmCTensor H1({nkpts, nmo_max, nmo_max}, shared_allocator{TG.Node()}); + shmCTensor H1({nkpts, npol*nmo_max, npol*nmo_max}, shared_allocator{TG.Node()}); std::vector LQKikn; LQKikn.reserve(nkpts); for (int Q = 0; Q < nkpts; Q++) @@ -244,7 +245,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b for (int Q = 0; Q < nkpts; Q++) { // until double_hyperslabs work! - boost::multi::array h1({nmo_per_kp[Q], nmo_per_kp[Q]}); + boost::multi::array h1({npol*nmo_per_kp[Q], npol*nmo_per_kp[Q]}); if (!dump.readEntry(h1, std::string("H1_kp") + std::to_string(Q))) { app_error() << " Error in KPFactorizedHamiltonian::getHamiltonianOperations():" @@ -253,7 +254,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b } // H1[Q]({0,nmo_per_kp[Q]},{0,nmo_per_kp[Q]}) = h1; // using add to get raw pointer dispatch, otherwise matrix copy is going to sync - ma::add(ComplexType(1.0), h1, ComplexType(0.0), h1, H1[Q]({0, nmo_per_kp[Q]}, {0, nmo_per_kp[Q]})); + ma::add(ComplexType(1.0), h1, ComplexType(0.0), h1, + H1[Q]({0, npol*nmo_per_kp[Q]}, {0, npol*nmo_per_kp[Q]})); } // read LQ if (!dump.push("KPFactorized", false)) @@ -314,7 +316,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b else { for (int i = 0; i < ndet; i++) - if (not get_nocc_per_kp(nmo_per_kp, PsiT[i], nocc_per_kp[i])) + if (not get_nocc_per_kp(nmo_per_kp, PsiT[i], nocc_per_kp[i],npol==2)) { app_error() << " Error in KPFactorizedHamiltonian::getHamiltonianOperations():" << " Only wavefunctions in block-diagonal form are accepted. \n"; @@ -337,7 +339,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b */ std::vector LQKank; LQKank.reserve(ndet * nspins * nkpts); // storing 2 components for Q=0, since it is not assumed symmetric - shmCMatrix haj({ndet * nkpts, (type == COLLINEAR ? 2 : 1) * nocc_max * nmo_max}, + shmCMatrix haj({ndet * nkpts, (type == COLLINEAR ? 
2 : 1) * nocc_max * npol * nmo_max}, shared_allocator{TG.Node()}); if (TG.Node().root()) std::fill_n(haj.origin(), haj.num_elements(), ComplexType(0.0)); @@ -464,16 +466,16 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b } else { - auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K); + auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); - boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {na, ni}); + boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {na, npol*ni}); if (na > 0) - ma::product(ComplexType(2.0), Psi, H1[K]({0, ni}, {0, ni}), ComplexType(0.0), haj_r); + ma::product(ComplexType(2.0), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r); } } } } - // Generate LQKank and haj + // Generate LQKank for (int nd = 0, nq0 = 0; nd < ndet; nd++, nq0 += nkpts * nspins) { for (int Q = 0; Q < nkpts; Q++) @@ -484,7 +486,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b { if (nt % Qcomm.size() == Qcomm.rank()) { - // haj and add half-transformed right-handed rotation for Q=0 + // add half-transformed right-handed rotation for Q=0 int Qm = kminus[Q]; int QK = QKtok2[Q][K]; int na = nocc_per_kp[nd][K]; @@ -529,7 +531,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b } else { - auto Psi = get_PsiK(nmo_per_kp, PsiT[nd], K); + auto Psi = get_PsiK(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); if (Q <= Qm) { @@ -586,7 +588,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b } else { - auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK); + auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na, nchol, ni}); ma_rotate::getLank_from_Lkin(PsiQK, Likn, Lbnl, buff); @@ -717,8 +719,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b SpMatrix Tabn({na, nb * nchol}); Sp3Tensor_ref T3abn(Tabn.origin(), {na, nb, nchol}); - auto Gal = get_PsiK>(nmo_per_kp, PsiT[0], Ka); - auto Gbk = get_PsiK>(nmo_per_kp, PsiT[0], Kb); + auto Gal = get_PsiK>(nmo_per_kp, PsiT[0], Ka, npol==2); + auto Gbk = get_PsiK>(nmo_per_kp, PsiT[0], Kb, npol==2); for (int a = 0; a < na; ++a) for (int l = 0; l < nl; ++l) Gal[a][l] = ma::conj(Gal[a][l]); @@ -810,6 +812,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( assert(PsiT.size() % 2 == 0); int nspins = ((type != COLLINEAR) ? 1 : 2); int ndet = PsiT.size() / nspins; + int npol = ((type == NONCOLLINEAR) ? 2 : 1); if (ndet > 1) APP_ABORT("Error: ndet > 1 not yet implemented in THCHamiltonian::getHamiltonianOperations.\n"); @@ -955,7 +958,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( int nmo_max = *std::max_element(nmo_per_kp.begin(), nmo_per_kp.end()); int nchol_max = *std::max_element(nchol_per_kp.begin(), nchol_per_kp.end()); - shmCTensor H1({nkpts, nmo_max, nmo_max}, shared_allocator{TG.Node()}); + shmCTensor H1({nkpts, npol*nmo_max, npol*nmo_max}, shared_allocator{TG.Node()}); std::vector LQKikn; LQKikn.reserve(nkpts); for (int Q = 0; Q < nkpts; Q++) @@ -971,7 +974,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( for (int Q = 0; Q < nkpts; Q++) { // until double_hyperslabs work! 
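// Note: npol == 2 only for NONCOLLINEAR, so the per-kpoint block of H1 read
// below is spinor-sized, {npol*nmo_per_kp[Q], npol*nmo_per_kp[Q]}, consistent
// with the H1 allocation above; for CLOSED/COLLINEAR (npol == 1) the shape is
// unchanged.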
- boost::multi::array h1({nmo_per_kp[Q], nmo_per_kp[Q]}); + boost::multi::array h1({npol*nmo_per_kp[Q], npol*nmo_per_kp[Q]}); if (!dump.readEntry(h1, std::string("H1_kp") + std::to_string(Q))) { app_error() << " Error in KPFactorizedHamiltonian::getHamiltonianOperations():" @@ -979,7 +982,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( APP_ABORT(""); } //H1[Q]({0,nmo_per_kp[Q]},{0,nmo_per_kp[Q]}) = h1; - ma::add(ComplexType(1.0), h1, ComplexType(0.0), h1, H1[Q]({0, nmo_per_kp[Q]}, {0, nmo_per_kp[Q]})); + ma::add(ComplexType(1.0), h1, ComplexType(0.0), h1, + H1[Q]({0, npol*nmo_per_kp[Q]}, {0, npol*nmo_per_kp[Q]})); } } if (distNode.root()) @@ -1054,7 +1058,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( else { for (int i = 0; i < ndet; i++) - if (not get_nocc_per_kp(nmo_per_kp, PsiT[i], nocc_per_kp[i])) + if (not get_nocc_per_kp(nmo_per_kp, PsiT[i], nocc_per_kp[i],npol==2)) { app_error() << " Error in KPFactorizedHamiltonian::getHamiltonianOperations():" << " Only wavefunctions in block-diagonal form are accepted. \n"; @@ -1078,7 +1082,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( */ std::vector LQKank; LQKank.reserve(ndet * nspins * nkpts); // storing 2 components for Q=0, since it is not assumed symmetric - shmCMatrix haj({ndet * nkpts, (type == COLLINEAR ? 2 : 1) * nocc_max * nmo_max}, + shmCMatrix haj({ndet * nkpts, (type == COLLINEAR ? 2 : 1) * nocc_max * npol * nmo_max}, shared_allocator{TG.Node()}); if (TG.Node().root()) std::fill_n(to_address(haj.origin()), haj.num_elements(), ComplexType(0.0)); @@ -1224,11 +1228,11 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( } else { - auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K); + auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); - boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {nocc_max, nmo_max}); + boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {nocc_max, npol*nmo_max}); if (na > 0) - ma::product(ComplexType(2.0), Psi, H1[K]({0, ni}, {0, ni}), ComplexType(0.0), haj_r({0, na}, {0, ni})); + ma::product(ComplexType(2.0), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r({0, na}, {0, npol*ni})); } } } @@ -1243,7 +1247,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { if (nt % Qcomm.size() == Qcomm.rank()) { - // haj and add half-transformed right-handed rotation for Q=0 + // add half-transformed right-handed rotation for Q=0 int Qm = kminus[Q]; int QK = QKtok2[Q][K]; int na = nocc_per_kp[nd][K]; @@ -1291,7 +1295,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( } else { - auto Psi = get_PsiK(nmo_per_kp, PsiT[nd], K); + auto Psi = get_PsiK(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); if (Q <= Qm) { @@ -1350,7 +1354,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( } else { - auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK); + auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nchol_max, nmo_max}); Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nmo_max, nchol_max}); @@ -1496,8 +1500,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( SpMatrix Tabn({na, nb * nchol_max}); Sp3Tensor_ref 
T3abn(Tabn.origin(), {na, nb, nchol_max}); - auto Gal = get_PsiK>(nmo_per_kp, PsiT[0], Ka); - auto Gbk = get_PsiK>(nmo_per_kp, PsiT[0], Kb); + auto Gal = get_PsiK>(nmo_per_kp, PsiT[0], Ka, npol==2); + auto Gbk = get_PsiK>(nmo_per_kp, PsiT[0], Kb, npol==2); for (int a = 0; a < na; ++a) for (int l = 0; l < nl; ++l) Gal[a][l] = ma::conj(Gal[a][l]); diff --git a/src/AFQMC/Utilities/kp_utilities.hpp b/src/AFQMC/Utilities/kp_utilities.hpp index 5c1eab4e69..7d7b64002d 100644 --- a/src/AFQMC/Utilities/kp_utilities.hpp +++ b/src/AFQMC/Utilities/kp_utilities.hpp @@ -23,18 +23,20 @@ * If it is block diagonal form, it also returns the number of states in each k-point block. */ template -bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per_kp) +bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per_kp, bool noncolin=false) { int nkpts = nmo_per_kp.size(); int N = PsiT.size(0); int M = PsiT.size(1); + int npol = noncolin ? 2 : 1; + assert( M%npol == 0 ); assert(nocc_per_kp.size() == nkpts); std::fill_n(to_address(nocc_per_kp.origin()), nkpts, 0); - std::vector bounds(nkpts + 1); + std::vector bounds(npol*nkpts + 1); bounds[0] = 0; - for (int k = 0; k < nkpts; k++) - bounds[k + 1] = bounds[k] + nmo_per_kp[k]; + for (int k = 0; k < npol*nkpts; k++) + bounds[k + 1] = bounds[k] + nmo_per_kp[k%nkpts]; int Q = 0; for (int i = 0; i < N; i++) { @@ -48,8 +50,10 @@ bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per // check the kp index of the first non-zero column. Mut be either >= Q auto it = std::lower_bound(bounds.begin(), bounds.end(), *col + 1) - 1; assert(it != bounds.end()); - int Q_ = std::distance(bounds.begin(), it); + int Q_ = std::distance(bounds.begin(), it)%nkpts; + int pol_ = std::distance(bounds.begin(), it)/nkpts; assert(Q_ >= 0 && Q_ < nkpts); + assert( pol_ == 0 || pol_== 1 ); if (Q_ < Q) { std::fill_n(to_address(nocc_per_kp.origin()), nkpts, 0); @@ -58,7 +62,8 @@ bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per Q = Q_; for (int ni = 0; ni < nt; ++ni, ++col) { - if (*col < bounds[Q] || *col >= bounds[Q + 1]) + if ( (*col < bounds[Q] || *col >= bounds[Q + 1]) && + (*col < bounds[(npol-1)*nkpts + Q] || *col >= bounds[(npol-1)*nkpts + Q + 1]) ) { std::fill_n(to_address(nocc_per_kp.origin()), nkpts, 0); return false; @@ -70,17 +75,19 @@ bool get_nocc_per_kp(Vector const& nmo_per_kp, CSR const& PsiT, Array&& nocc_per } template -Array get_PsiK(Vector const& nmo_per_kp, CSR const& PsiT, int K) +Array get_PsiK(Vector const& nmo_per_kp, CSR const& PsiT, int K, bool noncolin=false) { int nkpts = nmo_per_kp.size(); int N = PsiT.size(0); int M = PsiT.size(1); + int npol = noncolin ? 2 : 1; + assert( M%npol == 0 ); int nel = 0; - std::vector bounds(nkpts + 1); + std::vector bounds(npol*nkpts + 1); bounds[0] = 0; - for (int k = 0; k < nkpts; k++) - bounds[k + 1] = bounds[k] + nmo_per_kp[k]; + for (int k = 0; k < npol*nkpts; k++) + bounds[k + 1] = bounds[k] + nmo_per_kp[k%nkpts]; int Q = 0; for (int i = 0; i < N; i++) { @@ -91,19 +98,22 @@ Array get_PsiK(Vector const& nmo_per_kp, CSR const& PsiT, int K) // check the kp index of the first non-zero column. 
Must be >= Q auto it = std::lower_bound(bounds.begin(), bounds.end(), *col + 1) - 1; assert(it != bounds.end()); int Q_ = std::distance(bounds.begin(), it)%nkpts; int pol_ = std::distance(bounds.begin(), it)/nkpts; assert(Q_ >= 0 && Q_ < nkpts); assert(pol_ == 0 || pol_ == 1); if (Q_ < Q) APP_ABORT("Error: PsiT not in block-diagonal form in get_PsiK.\n"); Q = Q_; for (int ni = 0; ni < nt; ++ni, ++col) if ( (*col < bounds[Q] || *col >= bounds[Q + 1]) && (*col < bounds[(npol-1)*nkpts + Q] || *col >= bounds[(npol-1)*nkpts + Q + 1]) ) APP_ABORT("Error: PsiT not in block-diagonal form in get_PsiK.\n"); if (Q == K) nel++; } using element = typename std::decay::type::element; Array A({nel, npol*nmo_per_kp[K]}); using std::fill_n; fill_n(A.origin(), A.num_elements(), element(0)); nel = 0; for (int i = 0; i < N; i++) { auto nt = PsiT.num_non_zero_elements(i); auto col = PsiT.non_zero_indices2_data(i); auto val = PsiT.non_zero_values_data(i); // check the kp index of the first non-zero column. Must be >= Q auto it = std::lower_bound(bounds.begin(), bounds.end(), *col + 1) - 1; int Q = std::distance(bounds.begin(), it)%nkpts; if (Q == K) { for (int ni = 0; ni < nt; ++ni, ++col, ++val) { if(*col < bounds[K+1]) // alpha A[nel][*col - bounds[K]] = static_cast(*val); else // beta A[nel][*col - bounds[nkpts + K] + nmo_per_kp[K]] = static_cast(*val); } nel++; } if (Q > K) From 67753f150dd3d496aab7ca7397934d69bdbf9e16 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Mon, 14 Sep 2020 17:58:06 -0700 Subject: [PATCH 04/10] fixed HamOps unit test to handle all types, fixed cholesky rotation routines and get_psiK to handle noncollinear, still not done with KP ham ops --- .../KP3IndexFactorization.hpp | 6 +- .../tests/test_hamiltonian_operations.cpp | 79 +++++++++------- .../Hamiltonians/KPFactorizedHamiltonian.cpp | 48 +++++----- .../SlaterDeterminantOperations/rotate.hpp | 90 ++++++++++++++----- 4 files changed, 141 insertions(+), 82 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp index 98061a7054..768da6b2b4 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp @@ -288,8 +288,7 @@ class KP3IndexFactorization int getKr = KEright != nullptr; int getKl = KEleft != nullptr; if (E.size(0) != nwalk || E.size(1) < 3) - APP_ABORT(" Error in AFQMC/HamiltonianOperations/sparse_matrix_energy::calculate_energy(). Incorrect matrix " - "dimensions \n"); + APP_ABORT(" Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). Incorrect matrix dimensions \n"); size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max); size_t cnt(0); if (addEJ) { @@ -659,8 +658,7 @@ class KP3IndexFactorization int getKr = KEright != nullptr; int getKl = KEleft != nullptr; if (E.size(0) != nwalk || E.size(1) < 3) - APP_ABORT(" Error in AFQMC/HamiltonianOperations/sparse_matrix_energy::calculate_energy(). Incorrect matrix " - "dimensions \n"); + APP_ABORT(" Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). 
Incorrect matrix dimensions\n"); size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max); size_t cnt(0); diff --git a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp index 554ab201f6..63fa7c2251 100644 --- a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp +++ b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp @@ -197,10 +197,15 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) REQUIRE(imag(Ovlp) == Approx(0.0)); boost::multi::array Eloc({1, 3}, alloc_); - boost::multi::array_ref Gw(make_device_ptr(G.origin()), {1, NEL * NPOL * NMO}); - // This assumes {nwalk,...} which is not correct for some HamOps - // make this generic - HOps.energy(Eloc, Gw, 0, TG.getCoreID() == 0); + { + int nc=1,nr=NEL * NPOL * NMO; + if(HOps.transposed_G_for_E()) { + nr=1; + nc=NEL * NPOL * NMO; + } + boost::multi::array_ref Gw(make_device_ptr(G.origin()), {nr, nc}); + HOps.energy(Eloc, Gw, 0, TG.getCoreID() == 0); + } Eloc[0][0] = (TG.Node() += ComplexType(Eloc[0][0])); Eloc[0][1] = (TG.Node() += ComplexType(Eloc[0][1])); Eloc[0][2] = (TG.Node() += ComplexType(Eloc[0][2])); @@ -229,38 +234,46 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) auto nCV = HOps.local_number_of_cholesky_vectors(); CMatrix X({nCV, 1}, alloc_); - //HOps.vbias(Gw,X,sqrtdt); + { + int nc=1,nr=NEL * NPOL * NMO; + if(HOps.transposed_G_for_vbias()) { + nr=1; + nc=NEL * NPOL * NMO; + } + boost::multi::array_ref Gw(make_device_ptr(G.origin()), {nr, nc}); + HOps.vbias(Gw,X,sqrtdt); + } TG.local_barrier(); ComplexType Xsum = 0; - //for(int i=0; i1e-8) { - //REQUIRE( real(Xsum) == Approx(real(file_data.Xsum)) ); - //REQUIRE( imag(Xsum) == Approx(imag(file_data.Xsum)) ); - //} else { - //app_log()<<" Xsum: " <1e-8) { + REQUIRE( real(Xsum) == Approx(real(file_data.Xsum)) ); + REQUIRE( imag(Xsum) == Approx(imag(file_data.Xsum)) ); + } else { + app_log()<<" Xsum: " <1e-8) { - //REQUIRE( real(Vsum) == Approx(real(file_data.Vsum)) ); - //REQUIRE( imag(Vsum) == Approx(imag(file_data.Vsum)) ); - //} else { - //app_log()<<" Vsum: " <1e-8) { + REQUIRE( real(Vsum) == Approx(real(file_data.Vsum)) ); + REQUIRE( imag(Vsum) == Approx(imag(file_data.Vsum)) ); + } else { + app_log()<<" Vsum: " <= 0) - LQKank.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{TG.Node()})); + LQKank.emplace_back(shmSpMatrix({nkpts, npol * ank_max}, shared_allocator{TG.Node()})); else LQKank.emplace_back(shmSpMatrix({1, 1}, shared_allocator{TG.Node()})); if (type == COLLINEAR) @@ -388,7 +388,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b for (int nd = 0; nd < ndet; nd++) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) - LQKbnl.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{TG.Node()})); + LQKbnl.emplace_back(shmSpMatrix({nkpts, npol * ank_max}, shared_allocator{TG.Node()})); if (type == COLLINEAR) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) @@ -435,7 +435,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b if (Q0 < 0) APP_ABORT(" Error: Could not find Q=0. 
\n"); - boost::multi::array buff({nmo_max, nchol_max}); + boost::multi::array buff({npol*nmo_max, nchol_max}); int nt = 0; for (int nd = 0; nd < ndet; nd++) { @@ -536,14 +536,14 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b if (Q <= Qm) { Sp3Tensor_ref Likn(to_address(LQKikn[Q][K].origin()), {ni, nk, nchol}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na, nchol, nk}); - ma_rotate::getLank(Psi, Likn, Lank, buff); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na*npol, nchol, nk}); + ma_rotate::getLank(Psi, Likn, Lank, buff, npol==2); } else { Sp3Tensor_ref Lkin(to_address(LQKikn[Qm][QK].origin()), {nk, ni, nchol}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na, nchol, nk}); - ma_rotate::getLank_from_Lkin(Psi, Lkin, Lank, buff); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na*npol, nchol, nk}); + ma_rotate::getLank_from_Lkin(Psi, Lkin, Lank, buff, npol==2); } } } @@ -590,8 +590,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b { auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); - Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na, nchol, ni}); - ma_rotate::getLank_from_Lkin(PsiQK, Likn, Lbnl, buff); + Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na*npol, nchol, ni}); + ma_rotate::getLank_from_Lkin(PsiQK, Likn, Lbnl, buff, npol==2); } } } @@ -683,6 +683,9 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b app_log() << " Sampling EXX energy using distribution over Q vector obtained from " << " trial energy. \n"; + if(npol == 2) + APP_ABORT("Error: nsampleQ>0 not yet implemented for noncollinear.\n\n\n"); + RealType scl = (type == CLOSED ? 2.0 : 1.0); size_t nqk = 0; for (int Q = 0; Q < nkpts; ++Q) @@ -1091,7 +1094,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { for (int Q = 0; Q < nkpts; Q++) if (Qmap[Q] >= 0) - LQKank.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKank.emplace_back(shmSpMatrix({nkpts, npol * ank_max}, shared_allocator{distNode})); else LQKank.emplace_back(shmSpMatrix({1, 1}, shared_allocator{distNode})); if (type == COLLINEAR) @@ -1114,7 +1117,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( for (int Q = 0; Q < nkpts; Q++) { if (Qmap[Q] >= 0) - LQKakn.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKakn.emplace_back(shmSpMatrix({nkpts, npol * ank_max}, shared_allocator{distNode})); else LQKakn.emplace_back(shmSpMatrix({1, 1}, shared_allocator{distNode})); } @@ -1194,7 +1197,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( APP_ABORT(" Error: Could not find Q=0. 
\n"); TG.Node().barrier(); - boost::multi::array buff({nmo_max, nchol_max}); + boost::multi::array buff({npol*nmo_max, nchol_max}); int nt = 0; for (int nd = 0; nd < ndet; nd++) { @@ -1300,16 +1303,16 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( if (Q <= Qm) { Sp3Tensor_ref Likn(to_address(LQKikn[Q][K].origin()), {nmo_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {nocc_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {nocc_max, nchol_max, nmo_max}); - ma_rotate_padded::getLakn_Lank(Psi, Likn, Lakn, Lank); + Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {npol*nocc_max, nmo_max, nchol_max}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {npol*nocc_max, nchol_max, nmo_max}); + ma_rotate_padded::getLakn_Lank(Psi, Likn, Lakn, Lank, npol==2); } else { Sp3Tensor_ref Lkin(to_address(LQKikn[Qm][QK].origin()), {nmo_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {nocc_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {nocc_max, nchol_max, nmo_max}); - ma_rotate_padded::getLakn_Lank_from_Lkin(Psi, Lkin, Lakn, Lank, buff); + Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {npol*nocc_max, nmo_max, nchol_max}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {npol*nocc_max, nchol_max, nmo_max}); + ma_rotate_padded::getLakn_Lank_from_Lkin(Psi, Lkin, Lakn, Lank, buff, npol==2); } } } @@ -1356,9 +1359,9 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); - Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nchol_max, nmo_max}); - Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nmo_max, nchol_max}); - ma_rotate_padded::getLakn_Lank_from_Lkin(PsiQK, Likn, Lbln, Lbnl, buff); + Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {npol*nocc_max, nchol_max, nmo_max}); + Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {npol*nocc_max, nmo_max, nchol_max}); + ma_rotate_padded::getLakn_Lank_from_Lkin(PsiQK, Likn, Lbln, Lbnl, buff, npol==2); } } } @@ -1464,6 +1467,9 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( app_log() << " Sampling EXX energy using distribution over Q vector obtained from " << " trial energy. \n"; + if(npol == 2) + APP_ABORT("Error: nsampleQ>0 not yet implemented for noncollinear.\n\n\n"); + RealType scl = (type == CLOSED ? 2.0 : 1.0); size_t nqk = 0; for (int Q = 0; Q < nkpts; ++Q) diff --git a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp index 311202a3d6..86c98c02b8 100644 --- a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp @@ -612,28 +612,41 @@ void halfRotateCholeskyMatrix(WALKER_TYPES type, } // design for compact arrays +/* + * Constructs the following contraction of the Slater matrix of the trial wfn and the Cholesky Matrix (Likn). + * Case: + * - Closed/Collinear: L[a][n][k] = sum_i A[a][i] L[i][k][n] + * - In collinear case, two separate calls are made for each spin channel. 
+ * - Non-collinear: L[a][s][n][k] = sum_i A[a][si] L[i][k][n] // [si] == [s][i] combined spinor index
+ * - In this case, to preserve matrix dimensions, [a][s] --> [as] is kept as a single index; the peculiar index order is dictated by its use in the energy evaluation.
+ */ template void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, MultiArray2D&& buff, bool noncollinear=false) { int npol = noncollinear ? 2 : 1; int na = Aai.size(0); if (na == 0) return; int ni = Aai.size(1) / npol; int nk = Likn.size(1); int nchol = Likn.size(2); assert(Likn.size(0) == ni); assert(Lank.size(0) == na*npol); assert(Lank.size(1) == nchol); assert(Lank.size(2) == nk); assert(buff.size(0) >= nk); assert(buff.size(1) >= nchol); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous using elementA = typename std::decay::type::element; using element = typename std::decay::type::element; boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); boost::multi::array_ref Li_kn(to_address(Likn.origin()), {ni, nk * nchol}); boost::multi::array_ref Las_kn(to_address(Lank.origin()), {na * npol, nk * nchol}); ma::product(Aas_i, Li_kn, Las_kn); for (int a = 0; a < na * npol; a++) { boost::multi::array_ref Lkn(to_address(Lank[a].origin()), {nk, nchol}); boost::multi::array_ref Lnk(to_address(Lank[a].origin()), {nchol, nk}); buff({0, nk}, {0, nchol}) = Lkn; ma::transpose(buff({0, nk}, {0, nchol}), Lnk); } } +/* + * Constructs the following contraction of the Slater matrix of the trial wfn and the Cholesky Matrix (Likn). + * Case: + * - Closed/Collinear: L[a][n][k] = sum_i A[a][i] conj(L[k][i][n]) + * - In collinear case, two separate calls are made for each spin channel. + * - Non-collinear: L[a][s][n][k] = sum_i A[a][si] conj(L[k][i][n]) // [si] == [s][i] combined spinor index + * - In this case, to preserve matrix dimensions, [a][s] --> [as] is kept as a single index; the peculiar index order is dictated by its use in the energy evaluation. + */ template void getLank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC&& Lank, MultiArray2D&& buff, bool noncollinear=false) { int npol = noncollinear ? 
2 : 1; int na = Aai.size(0); if (na == 0) return; - int ni = Aai.size(1); + int ni = Aai.size(1) / npol; int nk = Lkin.size(0); int nchol = Lkin.size(2); assert(Lkin.size(1) == ni); - assert(Lank.size(0) == na); + assert(Lank.size(0) == na * npol); assert(Lank.size(1) == nchol); assert(Lank.size(2) == nk); - assert(buff.num_elements() >= na * nchol); + assert(buff.num_elements() >= na * npol * nchol); + if(noncollinear) + assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous using Type = typename std::decay::type::element; - boost::multi::array_ref bna(to_address(buff.origin()), {nchol, na}); + using elementA = typename std::decay::type::element; + boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); + boost::multi::array_ref bna(to_address(buff.origin()), {nchol, na * npol}); // Lank[a][n][k] = sum_i Aai[a][i] conj(Lkin[k][i][n]) + // Lank[as][n][k] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nk; k++) { - ma::product(ma::H(Lkin[k]), ma::T(Aai), bna); - for (int a = 0; a < na; a++) + ma::product(ma::H(Lkin[k]), ma::T(Aas_i), bna); + for (int a = 0; a < na * npol; a++) for (int n = 0; n < nchol; n++) Lank[a][n][k] = bna[n][a]; } @@ -675,12 +702,13 @@ namespace ma_rotate_padded { // designed for padded arrays template -void getLakn_Lank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lakn, MultiArray3DC&& Lank) +void getLakn_Lank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lakn, MultiArray3DC&& Lank, bool noncollinear=false) { + int npol = noncollinear ? 2 : 1; int na = Aai.size(0); if (na == 0) return; - int ni = Aai.size(1); + int ni = Aai.size(1) / npol; int nmo = Likn.size(0); int nchol = Likn.size(2); @@ -689,17 +717,23 @@ void getLakn_Lank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lak assert(Lakn.size(1) == nmo); assert(Lakn.size(2) == nchol); + assert(Lakn.size(0) >= npol*na ); assert(Lakn.size(0) == Lank.size(0)); assert(Lank.size(1) == nchol); assert(Lank.size(2) == nmo); + if(noncollinear) + assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous + + using elmA = typename std::decay::type::element; using elmB = typename std::decay::type::element; using elmC = typename std::decay::type::element; + boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); boost::multi::array_ref Li_kn(Likn.origin(), {ni, nmo * nchol}); - boost::multi::array_ref La_kn(Lakn.origin(), {na, nmo * nchol}); + boost::multi::array_ref Las_kn(Lakn.origin(), {na * npol, nmo * nchol}); - ma::product(Aai, Li_kn, La_kn); + ma::product(Aas_i, Li_kn, Las_kn); for (int a = 0; a < na; a++) ma::transpose(Lakn[a], Lank[a]); } @@ -709,12 +743,14 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC&& Lakn, MultiArray3DC&& Lank, - MultiArray2D&& buff) + MultiArray2D&& buff, + bool noncollinear=false) { + int npol = noncollinear ? 
2 : 1; int na = Aai.size(0); if (na == 0) return; - int ni = Aai.size(1); + int ni = Aai.size(1) / npol; int nmo = Lkin.size(0); int nchol = Lkin.size(2); @@ -723,24 +759,30 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, assert(Lakn.size(1) == nmo); assert(Lakn.size(2) == nchol); + assert(Lakn.size(0) >= npol*na ); assert(Lakn.size(0) == Lank.size(0)); assert(Lank.size(1) == nchol); assert(Lank.size(2) == nmo); - assert(buff.num_elements() >= na * nchol); + if(noncollinear) + assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous + + assert(buff.num_elements() >= na * npol * nchol); using ptr2 = typename std::decay::type::element_ptr; using elm2 = typename std::decay::type::element; + using elmA = typename std::decay::type::element; - boost::multi::array_ref bna(buff.origin(), {nchol, na}); - // Lakn[a][k][n] = sum_i Aai[a][i] conj(Lkin[k][i][n]) + boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); + boost::multi::array_ref bna(buff.origin(), {nchol, na * npol}); + // Lakn[as][k][n] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nmo; k++) { ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bna); - for (int a = 0; a < na; a++) + for (int a = 0; a < na * npol; a++) Lakn[a][k] = bna({0, nchol}, a); } - for (int a = 0; a < na; a++) + for (int a = 0; a < na * npol; a++) ma::transpose(Lakn[a], Lank[a]); } From 8cc9a130bdb30815766b2a0de4bd2bf7f82a33c0 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Tue, 15 Sep 2020 12:33:27 -0700 Subject: [PATCH 05/10] noncollinear changes to SlaterDetOps and Wfn, unit tests in both pass on cpu --- .../KP3IndexFactorization.hpp | 127 +++++++++++------- .../tests/test_hamiltonian_operations.cpp | 7 +- .../Hamiltonians/KPFactorizedHamiltonian.cpp | 24 ++-- .../SlaterDetOperations_base.hpp | 23 +++- .../SlaterDetOperations_serial.hpp | 42 ++++-- .../SlaterDetOperations_shared.hpp | 23 +++- .../apply_expM.hpp | 11 +- .../SlaterDeterminantOperations/rotate.hpp | 62 ++++----- src/AFQMC/Wavefunctions/NOMSD.hpp | 4 +- .../Wavefunctions/WavefunctionFactory.cpp | 48 +++---- .../Wavefunctions/tests/test_wfn_factory.cpp | 10 +- 11 files changed, 233 insertions(+), 148 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp index 768da6b2b4..3f95758b0b 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp @@ -277,6 +277,7 @@ class KP3IndexFactorization int nwalk = Gc.size(1); int nspin = (walker_type == COLLINEAR ? 2 : 1); + int npol = (walker_type == NONCOLLINEAR ? 2 : 1); int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); int nmo_max = *std::max_element(nopk.begin(), nopk.end()); int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); @@ -290,7 +291,7 @@ class KP3IndexFactorization if (E.size(0) != nwalk || E.size(1) < 3) APP_ABORT(" Error in AFQMC/HamiltonianOperations/KP3IndexFactorization::energy(). 
Incorrect matrix dimensions \n"); - size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max); + size_t mem_needs(nwalk * nkpts * nkpts * nspin * nocca_max * nmo_max * npol); size_t cnt(0); if (addEJ) { @@ -366,35 +367,35 @@ class KP3IndexFactorization for (int n = 0; n < nwalk; n++) std::fill_n(E[n].origin(), 3, ComplexType(0.)); - assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot); - boost::multi::array_cref G3Da(to_address(Gc.origin()), {nocca_tot, nmo_tot, nwalk}); + assert(Gc.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot); + boost::multi::array_cref G3Da(to_address(Gc.origin()), {nocca_tot * npol, nmo_tot, nwalk}); boost::multi::array_cref G3Db(to_address(Gc.origin()) + G3Da.num_elements() * (nspin - 1), {noccb_tot, nmo_tot, nwalk}); // with yet another mapping, it is possible to reduce the memory usage here! // avoiding for now! - Sp4Tensor_ref GKK(to_address(SM_TMats.origin()) + cnt, {nspin, nkpts, nkpts, nwalk * nmo_max * nocca_max}); + Sp4Tensor_ref GKK(to_address(SM_TMats.origin()) + cnt, {nspin, nkpts, nkpts, nwalk * npol * nmo_max * nocca_max}); GKaKjw_to_GKKwaj(nd, Gc, GKK, nocca_tot, noccb_tot, nmo_tot, nmo_max * nocca_max); comm->barrier(); // one-body contribution // haj[ndet*nkpts][nocc*nmo] - // not parallelized for now, since it would require customization of Wfn if (addH1) { - // must use Gc since GKK is is SP int na = 0, nk = 0, nb = 0; for (int n = 0; n < nwalk; n++) E[n][0] = E0; for (int K = 0; K < nkpts; ++K) { #ifdef MIXED_PRECISION - boost::multi::array_ref haj_K(to_address(haj[nd * nkpts + K].origin()), - {nelpk[nd][K], nopk[K]}); + // must use Gc since GKK is is SP + boost::multi::array_ref haj_K(to_address(haj[nd * nkpts + K].origin()), + {nelpk[nd][K], npol, nopk[K]}); for (int a = 0; a < nelpk[nd][K]; ++a) - ma::product(ComplexType(1.), ma::T(G3Da[na + a].sliced(nk, nk + nopk[K])), haj_K[a], ComplexType(1.), - E(E.extension(0), 0)); + for (int pol = 0; pol < npol; ++pol) + ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+pol].sliced(nk, nk + nopk[K])), + haj_K[a][pol], ComplexType(1.), E(E.extension(0), 0)); na += nelpk[nd][K]; if (walker_type == COLLINEAR) { @@ -407,11 +408,12 @@ class KP3IndexFactorization } nk += nopk[K]; #else + // use GKK nk = nopk[K]; { na = nelpk[nd][K]; - CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()), {na * nk}); - SpMatrix_ref Gaj(to_address(GKK[0][K][K].origin()), {nwalk, na * nk}); + CVector_ref haj_K(to_address(haj[nd * nkpts + K].origin()), {na * npol * nk}); + SpMatrix_ref Gaj(to_address(GKK[0][K][K].origin()), {nwalk, na * npol * nk}); ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E(E.extension(0), 0)); } if (walker_type == COLLINEAR) @@ -425,12 +427,10 @@ class KP3IndexFactorization } } - // move calculation of H1 here - // NOTE: For CLOSED/NONCOLLINEAR, can do all walkers simultaneously to improve perf. of GEMM - // Not sure how to do it for COLLINEAR. 
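// In the exchange block below, the noncollinear spin index is folded into the
// orbital/k-point index: the G blocks are {nwalk*na, npol*nl} and the
// half-rotated Cholesky blocks {na*nchol, npol*nk}, so the same GEMMs cover
// CLOSED/COLLINEAR (npol == 1) and NONCOLLINEAR (npol == 2).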
if (addEXX) { - size_t local_memory_needs = 2 * nwalk * nocca_max * nocca_max * nchol_max + 2 * nchol_max * nwalk; + size_t local_memory_needs = 2 * nwalk * nocca_max * nocca_max * nchol_max + + 2 * nchol_max * nwalk; if (TMats.num_elements() < local_memory_needs) TMats.reextent({local_memory_needs, 1}); cnt = 0; @@ -463,18 +463,21 @@ class KP3IndexFactorization int na = nelpk[nd][Ka]; int nk = nopk[Kk]; - SpMatrix_ref Gwal(GKK[0][Ka][Kl].origin(), {nwalk * na, nl}); - SpMatrix_ref Gwbk(GKK[0][Kb][Kk].origin(), {nwalk * nb, nk}); - SpMatrix_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), {na * nchol, nk}); + SpMatrix_ref Gwal(GKK[0][Ka][Kl].origin(), {nwalk * na , npol * nl}); + SpMatrix_ref Gwbk(GKK[0][Kb][Kk].origin(), {nwalk * nb , npol * nk}); + SpMatrix_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][Ka].origin()), + {na * nchol, npol * nk}); auto bnl_ptr(to_address(LQKank[nd * nspin * nkpts + Qm][Kb].origin())); if (Qmap[Q] > 0) bnl_ptr = to_address(LQKbnl[nd * nspin * number_of_symmetric_Q + Qmap[Q] - 1][Kb].origin()); - SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, nl}); + SpMatrix_ref Lbnl(bnl_ptr, {nb * nchol, npol * nl}); SpMatrix_ref Twban(TMats.origin() + cnt, {nwalk * nb, na * nchol}); Sp4Tensor_ref T4Dwban(TMats.origin() + cnt, {nwalk, nb, na, nchol}); - SpMatrix_ref Twabn(Twban.origin() + Twban.num_elements(), {nwalk * na, nb * nchol}); - Sp4Tensor_ref T4Dwabn(Twban.origin() + Twban.num_elements(), {nwalk, na, nb, nchol}); + SpMatrix_ref Twabn(Twban.origin() + Twban.num_elements(), + {nwalk * na, nb * nchol}); + Sp4Tensor_ref T4Dwabn(Twban.origin() + Twban.num_elements(), + {nwalk, na, nb, nchol}); if (na > 0 && nb > 0) ma::product(Gwal, ma::T(Lbnl), Twabn); @@ -1280,6 +1283,7 @@ class KP3IndexFactorization typename = typename std::enable_if_t<(std::decay::type::dimensionality == 2)>> void vbias(const MatA& Gw, MatB&& v, double a = 1., double c = 0., int nd = 0) { + using std::copy_n; using GType = typename std::decay_t; using vType = typename std::decay::type::element; int nkpts = nopk.size(); @@ -1288,6 +1292,7 @@ class KP3IndexFactorization assert(v.size(0) == 2 * local_nCV); assert(v.size(1) == nwalk); int nspin = (walker_type == COLLINEAR ? 2 : 1); + int npol = (walker_type == NONCOLLINEAR ? 2 : 1); int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); int nmo_max = *std::max_element(nopk.begin(), nopk.end()); int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); @@ -1306,12 +1311,14 @@ class KP3IndexFactorization SPComplexType minusimhalfa(0.0, -0.5 * a * scl); SPComplexType imhalfa(0.0, 0.5 * a * scl); size_t local_memory_needs = 2 * nchol_max * nwalk; + if(walker_type == NONCOLLINEAR) + local_memory_needs += nmo_max * npol * nwalk; // for transposed G if (TMats.num_elements() < local_memory_needs) TMats.reextent({local_memory_needs, 1}); SpMatrix_ref vlocal(TMats.origin(), {2 * nchol_max, nwalk}); std::fill_n(vlocal.origin(), vlocal.num_elements(), SPComplexType(0.0)); - assert(Gw.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot); + assert(Gw.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot); const_sp_pointer Gptr(nullptr); // I WANT C++17!!!!!! 
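// (Sketch only, not part of this patch: with C++17 the type dispatch below
// could be written as
//   if constexpr (std::is_same<GType, SPComplexType>::value)
//     { /* point Gptr at Gw directly */ }
//   else
//     { /* cast-copy Gw into SM_TMats first, as done below */ }
// so the discarded branch is never instantiated.)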
if (std::is_same::value) @@ -1331,7 +1338,7 @@ class KP3IndexFactorization comm->barrier(); } - boost::multi::array_cref G3Da(Gptr, {nocca_tot, nmo_tot, nwalk}); + boost::multi::array_cref G3Da(Gptr, {nocca_tot, npol, nmo_tot, nwalk}); boost::multi::array_cref G3Db(Gptr + G3Da.num_elements() * (nspin - 1), {noccb_tot, nmo_tot, nwalk}); @@ -1343,7 +1350,7 @@ class KP3IndexFactorization } comm->barrier(); - size_t nqk = 0; // start count at 1 to "offset" the calcuation of E1 done at root + size_t nqk = 0; for (int K = 0; K < nkpts; ++K) { // K is the index of the kpoint pair of (a,k) for (int Q = 0; Q < nkpts; ++Q) @@ -1361,11 +1368,23 @@ class KP3IndexFactorization int nk0 = std::accumulate(nopk.begin(), nopk.begin() + QKToK2[Q][K], 0); auto&& v1 = vlocal({0, nchol}, {0, nwalk}); - Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][K].origin()), {na, nchol, nk}); - - // v1[Q][n][nw] += sum_K sum_a_k LQK[a][n][k] G[a][k][nw] - for (int a = 0; a < na; ++a) - ma::product(one, Lank[a], G3Da[na0 + a]({nk0, nk0 + nk}, {0, nwalk}), one, v1); + if (walker_type == NONCOLLINEAR) { + Sp3Tensor_ref Lank(to_address(LQKank[nd * nspin * nkpts + Q][K].origin()), {na, nchol, npol*nk}); + // v1[Q][n][nw] += sum_K sum_a_p_k LQK[a][n][p][k] G[a][p][k][nw] + for (int a = 0; a < na; ++a) { + SpMatrix_ref Ga( to_address(vlocal.origin())+vlocal.num_elements(), {npol*nk,nwalk} ); + SpMatrix_ref Ga_( Ga.origin(), {npol,nk*nwalk} ); + for(int p=0; p Gca(to_address(GKaKj.origin()), {nocca_tot, nmo_tot, nwalk}); + assert(GKaKj.num_elements() == (nocca_tot + noccb_tot) * npol * nmo_tot * nwalk); + assert(GKKaj.num_elements() == nspin * nkpts * nkpts * npol * akmax * nwalk); + boost::multi::array_cref Gca(to_address(GKaKj.origin()), {nocca_tot, npol, nmo_tot, nwalk}); boost::multi::array_cref Gcb(to_address(GKaKj.origin()) + Gca.num_elements(), {noccb_tot, nmo_tot, nwalk}); - boost::multi::array_ref GKK(to_address(GKKaj.origin()), {nspin, nkpts, nkpts, nwalk * akmax}); + boost::multi::array_ref GKK(to_address(GKKaj.origin()), {nspin, nkpts, nkpts, nwalk * npol * akmax}); int na0 = 0; for (int Ka = 0, Kaj = 0; Ka < nkpts; Ka++) { @@ -1603,18 +1637,21 @@ class KP3IndexFactorization continue; } auto G_(to_address(GKK[0][Ka][Kj].origin())); - int naj = na * nj; - for (int a = 0, aj = 0; a < na; a++) + int naj = na * nj * npol; + for (int a = 0, asj = 0; a < na; a++) { - auto Gc_(to_address(Gca[na0 + a][nj0].origin())); - for (int j = 0; j < nj; j++, aj++) + for (int p = 0; p < npol; p++) { - for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj) + auto Gc_(to_address(Gca[na0 + a][p][nj0].origin())); + for (int j = 0; j < nj; j++, asj++) + { + for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj) #ifdef MIXED_PRECISION - G_[waj + aj] = static_cast(*Gc_); + G_[waj + asj] = static_cast(*Gc_); #else - G_[waj + aj] = (*Gc_); + G_[waj + asj] = (*Gc_); #endif + } } } nj0 += nj; diff --git a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp index 63fa7c2251..066d3290d2 100644 --- a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp +++ b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp @@ -244,14 +244,17 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world) HOps.vbias(Gw,X,sqrtdt); } TG.local_barrier(); - ComplexType Xsum = 0; - for(int i=0; i1e-8) { REQUIRE( real(Xsum) == Approx(real(file_data.Xsum)) ); REQUIRE( imag(Xsum) == Approx(imag(file_data.Xsum)) ); } else { 
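// No stored reference value for this case: print the computed sums so they
// can be added to the test's reference data.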
app_log()<<" Xsum: " <>(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {na, npol*ni}); if (na > 0) - ma::product(ComplexType(2.0), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r); + ma::product(ComplexType(scl), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r); } } } @@ -536,13 +537,13 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b if (Q <= Qm) { Sp3Tensor_ref Likn(to_address(LQKikn[Q][K].origin()), {ni, nk, nchol}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na*npol, nchol, nk}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na, nchol, npol*nk}); ma_rotate::getLank(Psi, Likn, Lank, buff, npol==2); } else { Sp3Tensor_ref Lkin(to_address(LQKikn[Qm][QK].origin()), {nk, ni, nchol}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na*npol, nchol, nk}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {na, nchol, npol*nk}); ma_rotate::getLank_from_Lkin(Psi, Lkin, Lank, buff, npol==2); } } @@ -590,7 +591,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_shared(b { auto PsiQK = get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); - Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na*npol, nchol, ni}); + Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {na, nchol, npol*ni}); ma_rotate::getLank_from_Lkin(PsiQK, Likn, Lbnl, buff, npol==2); } } @@ -1231,11 +1232,12 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( } else { + RealType scl = (type == CLOSED ? 2.0 : 1.0); auto Psi = get_PsiK>(nmo_per_kp, PsiT[nd], K, npol==2); assert(Psi.size(0) == na); boost::multi::array_ref haj_r(to_address(haj[nd * nkpts + K].origin()), {nocc_max, npol*nmo_max}); if (na > 0) - ma::product(ComplexType(2.0), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r({0, na}, {0, npol*ni})); + ma::product(ComplexType(scl), Psi, H1[K]({0, npol*ni}, {0, npol*ni}), ComplexType(0.0), haj_r({0, na}, {0, npol*ni})); } } } @@ -1303,15 +1305,15 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( if (Q <= Qm) { Sp3Tensor_ref Likn(to_address(LQKikn[Q][K].origin()), {nmo_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {npol*nocc_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {npol*nocc_max, nchol_max, nmo_max}); + Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {nocc_max, npol*nmo_max, nchol_max}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {nocc_max, nchol_max, npol*nmo_max}); ma_rotate_padded::getLakn_Lank(Psi, Likn, Lakn, Lank, npol==2); } else { Sp3Tensor_ref Lkin(to_address(LQKikn[Qm][QK].origin()), {nmo_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {npol*nocc_max, nmo_max, nchol_max}); - Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {npol*nocc_max, nchol_max, nmo_max}); + Sp3Tensor_ref Lakn(to_address(LQKakn[nq0 + Q][K].origin()), {nocc_max, npol*nmo_max, nchol_max}); + Sp3Tensor_ref Lank(to_address(LQKank[nq0 + Q][K].origin()), {nocc_max, nchol_max, npol*nmo_max}); ma_rotate_padded::getLakn_Lank_from_Lkin(Psi, Lkin, Lakn, Lank, buff, npol==2); } } @@ -1359,8 +1361,8 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { auto PsiQK = 
get_PsiK(nmo_per_kp, PsiT[nd], QK, npol==2); assert(PsiQK.size(0) == na); - Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {npol*nocc_max, nchol_max, nmo_max}); - Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {npol*nocc_max, nmo_max, nchol_max}); + Sp3Tensor_ref Lbnl(to_address(LQKbnl[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, nchol_max, npol*nmo_max}); + Sp3Tensor_ref Lbln(to_address(LQKbln[nq0 + Qmap[Q] - 1][QK].origin()), {nocc_max, npol*nmo_max, nchol_max}); ma_rotate_padded::getLakn_Lank_from_Lkin(PsiQK, Likn, Lbln, Lbnl, buff, npol==2); } } diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_base.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_base.hpp index 78fe074d40..949404693e 100644 --- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_base.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_base.hpp @@ -235,31 +235,42 @@ class SlaterDetOperations_base } template - void Propagate(Mat&& A, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N') + void Propagate(Mat&& A, const MatP1& P1, const MatV& V, + int order = 6, char TA = 'N', bool noncollinear = false) { + int npol = noncollinear ? 2 : 1; int NMO = A.size(0); int NAEA = A.size(1); + int M = NMO/npol; + assert(NMO%npol == 0); + assert(P1.size(0) == NMO); + assert(P1.size(1) == NMO); + assert(V.size(0) == M); + assert(V.size(1) == M); TMatrix TMN({NMO, NAEA}, buffer_generator->template get_allocator()); - TMatrix T1({NMO, NAEA}, buffer_generator->template get_allocator()); - TMatrix T2({NMO, NAEA}, buffer_generator->template get_allocator()); + TMatrix T1({M, NAEA}, buffer_generator->template get_allocator()); + TMatrix T2({M, NAEA}, buffer_generator->template get_allocator()); using ma::H; using ma::T; if (TA == 'H' || TA == 'h') { ma::product(ma::H(P1), std::forward(A), TMN); - SlaterDeterminantOperations::base::apply_expM(V, TMN, T1, T2, order, TA); + for(int p=0; p(A)); } else if (TA == 'T' || TA == 't') { ma::product(ma::T(P1), std::forward(A), TMN); - SlaterDeterminantOperations::base::apply_expM(V, TMN, T1, T2, order, TA); + for(int p=0; p(A)); } else { ma::product(P1, std::forward(A), TMN); - SlaterDeterminantOperations::base::apply_expM(V, TMN, T1, T2, order); + for(int p=0; p(A)); } } diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp index 0341cd8172..fca3c36ae2 100644 --- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp @@ -136,7 +136,7 @@ class SlaterDetOperations_serial : public SlaterDetOperations_base - void BatchedPropagate(std::vector& Ai, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N') + void BatchedPropagate(std::vector& Ai, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N', bool noncollinear = false) { static_assert(pointedType::dimensionality == 2, " dimenionality == 2"); static_assert(std::decay::type::dimensionality == 3, " dimenionality == 3"); @@ -144,35 +144,59 @@ class SlaterDetOperations_serial : public SlaterDetOperations_basetemplate get_allocator()); TTensor T1({nbatch, NMO, NAEA}, buffer_generator->template get_allocator()); TTensor T2({nbatch, NMO, NAEA}, buffer_generator->template get_allocator()); - using ma::H; - using ma::T; // could be batched when csrmm is batched if (TA == 'H' || TA == 'h') { for (int ib = 0; ib < nbatch; ib++) 
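// Editor's sketch (assumption for illustration, not the author's code): for
// noncollinear walkers the Slater matrix A has npol*M rows (M = NMO/npol) while
// V from vHS is only M x M and spin-diagonal, so exp(V) is applied to each M-row
// polarization block independently; the serial path does, in effect,
//   for (int p = 0; p < npol; ++p)
//     SlaterDeterminantOperations::base::apply_expM(
//         V, TMN.sliced(p * M, (p + 1) * M), T1, T2, order, TA);
// and the batched path below gets the same effect by folding npol into the
// batch dimension (TMN viewed as {npol*nbatch, M, NAEA}).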
ma::product(ma::H(P1), *Ai[ib], TMN[ib]); - SlaterDeterminantOperations::batched::apply_expM(V, TMN, T1, T2, order, TA); - for (int ib = 0; ib < nbatch; ib++) - ma::product(ma::H(P1), TMN[ib], *Ai[ib]); } else if (TA == 'T' || TA == 't') { for (int ib = 0; ib < nbatch; ib++) ma::product(ma::T(P1), *Ai[ib], TMN[ib]); + } + else + { + for (int ib = 0; ib < nbatch; ib++) + ma::product(P1, *(Ai[ib]), TMN[ib]); + } + + // Apply V + if(noncollinear) { + // treat 2 polarizations as separate elements in the batch + using TTensor_ref = boost::multi::array_ref; + TTensor_ref TMN_( TMN.origin(), {npol*nbatch, M, NAEA}); + TTensor_ref T1_( T1.origin(), {npol*nbatch, M, NAEA}); + TTensor_ref T2_( T2.origin(), {npol*nbatch, M, NAEA}); + SlaterDeterminantOperations::batched::apply_expM(V, TMN_, T1_, T2_, order, TA); + } else { SlaterDeterminantOperations::batched::apply_expM(V, TMN, T1, T2, order, TA); + } + + if (TA == 'H' || TA == 'h') + { + for (int ib = 0; ib < nbatch; ib++) + ma::product(ma::H(P1), TMN[ib], *Ai[ib]); + } + else if (TA == 'T' || TA == 't') + { for (int ib = 0; ib < nbatch; ib++) ma::product(ma::T(P1), TMN[ib], *Ai[ib]); } else { - for (int ib = 0; ib < nbatch; ib++) - ma::product(P1, *(Ai[ib]), TMN[ib]); - SlaterDeterminantOperations::batched::apply_expM(V, TMN, T1, T2, order); for (int ib = 0; ib < nbatch; ib++) ma::product(P1, TMN[ib], *Ai[ib]); } diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_shared.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_shared.hpp index 2e39bb1123..f205d38dc8 100644 --- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_shared.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_shared.hpp @@ -159,15 +159,23 @@ class SlaterDetOperations_shared : public SlaterDetOperations_base - void Propagate(Mat&& A, const MatP1& P1, const MatV& V, communicator& comm, int order = 6, char TA = 'N') + void Propagate(Mat&& A, const MatP1& P1, const MatV& V, communicator& comm, + int order = 6, char TA = 'N', bool noncollinear = false) { + int npol = noncollinear ? 
2 : 1; int NMO = A.size(0); int NAEA = A.size(1); - set_shm_buffer(comm, 3 * NAEA * NMO); - assert(SM_TMats->num_elements() >= 3 * NAEA * NAEA); + int M = NMO/npol; + assert(NMO%npol == 0); + assert(P1.size(0) == NMO); + assert(P1.size(1) == NMO); + assert(V.size(0) == M); + assert(V.size(1) == M); + set_shm_buffer(comm, NAEA * (NMO + 2*M) ); + assert(SM_TMats->num_elements() >= NAEA * (NMO + 2*M) ); boost::multi::array_ref T0(to_address(SM_TMats->origin()), {NMO, NAEA}); - boost::multi::array_ref T1(to_address(SM_TMats->origin()) + NMO * NAEA, {NMO, NAEA}); - boost::multi::array_ref T2(to_address(SM_TMats->origin()) + 2 * NMO * NAEA, {NMO, NAEA}); + boost::multi::array_ref T1(to_address(T0.origin()) + T0.num_elements(), {M, NAEA}); + boost::multi::array_ref T2(to_address(T1.origin()) + T1.num_elements(), {M, NAEA}); using ma::H; using ma::T; if (comm.root()) @@ -180,7 +188,8 @@ class SlaterDetOperations_shared : public SlaterDetOperations_base(A), T0); } comm.barrier(); - SlaterDeterminantOperations::shm::apply_expM(V, T0, T1, T2, comm, order, TA); + for(int p=0; p - void BatchedPropagate(std::vector& Ai, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N') + void BatchedPropagate(std::vector& Ai, const MatP1& P1, const MatV& V, int order = 6, char TA = 'N', bool noncollinear=false) { APP_ABORT(" Error: Batched routines not compatible with SlaterDetOperations_shared::BatchedPropagate \n"); } diff --git a/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp b/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp index 24871376ac..ea1dd664b5 100644 --- a/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/apply_expM.hpp @@ -34,7 +34,7 @@ namespace base * Calculate S = exp(im*V)*S using a Taylor expansion of exp(V) */ template -inline void apply_expM(const MatA& V, MatB& S, MatC& T1, MatC& T2, int order = 6, char TA = 'N') +inline void apply_expM(const MatA& V, MatB&& S, MatC& T1, MatC& T2, int order = 6, char TA = 'N') { assert(V.size(0) == V.size(1)); assert(V.size(1) == S.size(0)); @@ -80,7 +80,7 @@ namespace shm * V, S, T1, T2 are expected to be in shared memory. 
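 * Editor's illustration (sketch of the Taylor step shared by these apply_expM
 * variants; buffer roles assumed from the signatures):
 *   T1 = S;
 *   for (int n = 1; n <= order; ++n) {
 *     T2 = (im / n) * V * T1;   // after this step, T2 = (i V)^n / n! applied to S
 *     S += T2;
 *     swap(T1, T2);             // reuse the two scratch buffers
 *   }
 * with im = i for TA == 'N' and V conjugated/transposed as requested by TA.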
*/ template -inline void apply_expM(const MatA& V, MatB& S, MatC& T1, MatC& T2, communicator& comm, int order = 6, char TA = 'N') +inline void apply_expM(const MatA& V, MatB&& S, MatC& T1, MatC& T2, communicator& comm, int order = 6, char TA = 'N') { assert(V.size(0) == S.size(0)); assert(V.size(1) == S.size(0)); @@ -134,13 +134,14 @@ namespace batched * Calculate S = exp(im*V)*S using a Taylor expansion of exp(V) */ template -inline void apply_expM(const MatA& V, MatB& S, MatC& T1, MatC& T2, int order = 6, char TA = 'N') +inline void apply_expM(const MatA& V, MatB&& S, MatC& T1, MatC& T2, + int order = 6, char TA = 'N') { static_assert(std::decay::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3"); static_assert(std::decay::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3"); static_assert(std::decay::type::dimensionality == 3, " batched::apply_expM::dimenionality == 3"); - assert(V.size(0) == S.size(0)); - assert(V.size(0) == T1.size(0)); + assert(V.size(0) == S.size(0)); + assert(V.size(0) == T1.size(0)); assert(V.size(0) == T2.size(0)); assert(V.size(1) == V.size(2)); assert(V.size(2) == S.size(1)); diff --git a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp index 86c98c02b8..cdf85807d4 100644 --- a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp @@ -617,8 +617,8 @@ void halfRotateCholeskyMatrix(WALKER_TYPES type, * Case: * - Closed/Collinear: L[a][n][k] = sum_i A[a][i] L[i][k][n] * - In collinear case, two separate calls are made for each spin channel. - * - Non-collinear: L[a][s][n][k] = sum_i A[a][si] L[i][k][n] // [si] == [s][i] combined spinor index - * - In this case, to preserve matrix dimenions, [a][s] --> [as] is kept as a single index, peculiar order is given by its use in Energy evaluation. + * - Non-collinear: L[a][n][sk] = sum_i A[a][si] L[i][k][n] // [si] == [s][i] combined spinor index + * - In this case, to preserve matrix dimensions, [s][k] --> [sk] is kept as a single index.
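+ * Editor's illustration (reference loops, not part of the original patch):
+ *   for (a < na) for (n < nchol) for (s < npol) for (k < nk)
+ *     Lank[a][n][s*nk + k] = sum_i Aai[a][s*ni + i] * Likn[i][k][n];
+ * implemented below as one gemm over the combined [a,s] row index followed by
+ * a per-a transpose from [sk][n] to [n][sk].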
*/ template void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, MultiArray2D&& buff, bool noncollinear=false) @@ -631,10 +631,10 @@ void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, Mu int nk = Likn.size(1); int nchol = Likn.size(2); assert(Likn.size(0) == ni); - assert(Lank.size(0) == na*npol); + assert(Lank.size(0) == na); assert(Lank.size(1) == nchol); - assert(Lank.size(2) == nk); - assert(buff.size(0) >= nk); + assert(Lank.size(2) == nk*npol); + assert(buff.size(0) >= npol * nk); assert(buff.size(1) >= nchol); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous @@ -646,12 +646,12 @@ void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, Mu boost::multi::array_ref Las_kn(to_address(Lank.origin()), {na * npol, nk * nchol}); ma::product(Aas_i, Li_kn, Las_kn); - for (int a = 0; a < na * npol; a++) + for (int a = 0; a < na; a++) { - boost::multi::array_ref Lkn(to_address(Lank[a].origin()), {nk, nchol}); - boost::multi::array_ref Lnk(to_address(Lank[a].origin()), {nchol, nk}); - buff({0, nk}, {0, nchol}) = Lkn; - ma::transpose(buff({0, nk}, {0, nchol}), Lnk); + boost::multi::array_ref Lskn(to_address(Lank[a].origin()), {npol * nk, nchol}); + boost::multi::array_ref Lnsk(to_address(Lank[a].origin()), {nchol, npol * nk}); + buff({0, npol * nk}, {0, nchol}) = Lskn; + ma::transpose(buff({0, npol * nk}, {0, nchol}), Lnsk); } } @@ -660,8 +660,8 @@ void getLank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lank, Mu * Case: * - Closed/Collinear: L[a][n][k] = sum_i A[a][i] conj(L[k][i][n]) * - In collinear case, two separate calls are made for each spin channel. - * - Non-collinear: L[a][s][n][k] = sum_i A[a][si] conj(L[k][i][n]) // [si] == [s][i] combined spinor index - * - In this case, to preserve matrix dimenions, [a][s] --> [as] is kept as a single index, peculiar order is given by its use in Energy evaluation. + * - Non-collinear: L[a][n][sk] = sum_i A[a][si] conj(L[k][i][n]) // [si] == [s][i] combined spinor index + * - In this case, to preserve matrix dimensions, [s][k] --> [sk] is kept as a single index.
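+ * Editor's illustration (assumed reference semantics, not part of the patch):
+ * for each k one gemm forms b[n][(a,s)] = sum_i conj(Lkin[k][i][n]) * Aai[(a,s)][i],
+ * which the scatter loop below writes into the polarization-blocked layout
+ *   Lank[a][n][s*nk + k] = b[n][a*npol + s];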
*/ template void getLank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC&& Lank, MultiArray2D&& buff, bool noncollinear=false) @@ -674,9 +674,9 @@ void getLank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC& int nk = Lkin.size(0); int nchol = Lkin.size(2); assert(Lkin.size(1) == ni); - assert(Lank.size(0) == na * npol); + assert(Lank.size(0) == na); assert(Lank.size(1) == nchol); - assert(Lank.size(2) == nk); + assert(Lank.size(2) == nk * npol); assert(buff.num_elements() >= na * npol * nchol); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous @@ -684,15 +684,16 @@ void getLank_from_Lkin(MultiArray2DA&& Aai, MultiArray3DB&& Lkin, MultiArray3DC& using Type = typename std::decay::type::element; using elementA = typename std::decay::type::element; boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); - boost::multi::array_ref bna(to_address(buff.origin()), {nchol, na * npol}); + boost::multi::array_ref bnas(to_address(buff.origin()), {nchol, na * npol}); // Lank[a][n][k] = sum_i Aai[a][i] conj(Lkin[k][i][n]) // Lank[as][n][k] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nk; k++) { - ma::product(ma::H(Lkin[k]), ma::T(Aas_i), bna); - for (int a = 0; a < na * npol; a++) + ma::product(ma::H(Lkin[k]), ma::T(Aas_i), bnas); + for (int a = 0; a < na; a++) for (int n = 0; n < nchol; n++) - Lank[a][n][k] = bna[n][a]; + for (int p = 0; p < npol; p++) + Lank[a][n][p*nk+k] = bnas[n][a*npol+p]; } } @@ -714,13 +715,13 @@ void getLakn_Lank(MultiArray2DA&& Aai, MultiArray3DB&& Likn, MultiArray3DC&& Lak int nchol = Likn.size(2); assert(Likn.size(1) == nmo); - assert(Lakn.size(1) == nmo); + assert(Lakn.size(1) == npol*nmo); assert(Lakn.size(2) == nchol); - assert(Lakn.size(0) >= npol*na ); + assert(Lakn.size(0) >= na ); assert(Lakn.size(0) == Lank.size(0)); assert(Lank.size(1) == nchol); - assert(Lank.size(2) == nmo); + assert(Lank.size(2) == npol*nmo); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous @@ -756,13 +757,13 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, int nchol = Lkin.size(2); assert(Lkin.size(1) == nmo); - assert(Lakn.size(1) == nmo); + assert(Lakn.size(1) == npol*nmo); assert(Lakn.size(2) == nchol); - assert(Lakn.size(0) >= npol*na ); + assert(Lakn.size(0) >= na ); assert(Lakn.size(0) == Lank.size(0)); assert(Lank.size(1) == nchol); - assert(Lank.size(2) == nmo); + assert(Lank.size(2) == npol*nmo); if(noncollinear) assert(Aai.stride(0) == Aai.size(1)); // make sure it is contiguous @@ -774,15 +775,16 @@ void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, using elmA = typename std::decay::type::element; boost::multi::array_ref Aas_i(to_address(Aai.origin()), {na * npol, ni}); - boost::multi::array_ref bna(buff.origin(), {nchol, na * npol}); - // Lakn[as][k][n] = sum_i Aai[as][i] conj(Lkin[k][i][n]) + boost::multi::array_ref bnas(buff.origin(), {nchol, na * npol}); + // Lakn[a][sk][n] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nmo; k++) { - ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bna); - for (int a = 0; a < na * npol; a++) - Lakn[a][k] = bna({0, nchol}, a); + ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bnas); + for (int a = 0; a < na; a++) + for (int p = 0; p < npol; p++) + Lakn[a][p*nmo+k] = bnas({0, nchol}, a*npol+p); } - for (int a = 0; a < na * npol; a++) + for (int a = 0; a < na; a++) ma::transpose(Lakn[a], Lank[a]); } diff --git a/src/AFQMC/Wavefunctions/NOMSD.hpp b/src/AFQMC/Wavefunctions/NOMSD.hpp index 
4cb29a64d6..d92a1b2399 100644 --- a/src/AFQMC/Wavefunctions/NOMSD.hpp +++ b/src/AFQMC/Wavefunctions/NOMSD.hpp @@ -753,7 +753,7 @@ class NOMSD : public AFQMCInfo return (full) ? (2 * NMO * NMO) : ((NAEA + NAEB) * NMO); break; case NONCOLLINEAR: - return (full) ? (4 * NMO * NMO) : ((NAEA + NAEB) * 2 * NMO); + return (full) ? (4 * NMO * NMO) : (NAEA * 2 * NMO); break; default: APP_ABORT(" Error: Unknown walker_type in dm_size. \n"); @@ -773,7 +773,7 @@ class NOMSD : public AFQMCInfo return (full) ? (arr{NMO, NMO}) : ((sp == Alpha) ? (arr{NAEA, NMO}) : (arr{NAEB, NMO})); break; case NONCOLLINEAR: - return (full) ? (arr{2 * NMO, 2 * NMO}) : (arr{NAEA + NAEB, 2 * NMO}); + return (full) ? (arr{2 * NMO, 2 * NMO}) : (arr{NAEA, 2 * NMO}); break; default: APP_ABORT(" Error: Unknown walker_type in dm_size. \n"); diff --git a/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp b/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp index 6c78c82bd1..78f1d6c01d 100644 --- a/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp +++ b/src/AFQMC/Wavefunctions/WavefunctionFactory.cpp @@ -88,6 +88,9 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, int NMO = AFinfo.NMO; int NAEA = AFinfo.NAEA; int NAEB = AFinfo.NAEB; + int NPOL = (walker_type == NONCOLLINEAR) ? 2 : 1; + if( (walker_type == NONCOLLINEAR) && (NAEB != 0) ) + APP_ABORT(" Error in Wavefunctions/WavefunctionFactory::fromASCII: noncollinear && NAEB!=0. \n\n\n "); std::ifstream in; in.open(filename.c_str()); @@ -114,7 +117,7 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, read_ph_wavefunction(in, ndets_to_read, walker_type, TGwfn.Node(), NMO, NAEA, NAEB, PsiT_MO); assert(abij.number_of_configurations() == ndets_to_read); int NEL = (walker_type == NONCOLLINEAR) ? (NAEA + NAEB) : NAEA; - int N_ = (walker_type == NONCOLLINEAR) ? 2 * NMO : NMO; + int N_ = NPOL * NMO; ComplexType one(1.0, 0.0); if (walker_type == COLLINEAR) PsiT.reserve(2 * ndets_to_read); @@ -290,14 +293,14 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, auto guess = initial_guess.find(name); if (guess == initial_guess.end()) { - auto newg = initial_guess.insert(std::make_pair(name, boost::multi::array({2, NMO, NAEA}))); + auto newg = initial_guess.insert(std::make_pair(name, boost::multi::array({2, NPOL*NMO, NAEA}))); int iC = (walker_type != COLLINEAR ? initial_configuration : 2 * initial_configuration); if (iC >= PsiT.size()) APP_ABORT(" Error: initial_configuration > ndets_to_read \n"); if (!newg.second) APP_ABORT(" Error: Problems adding new initial guess. \n"); using ma::conj; - std::fill_n((newg.first)->second.origin(), 2 * NMO * NAEA, ComplexType(0.0, 0.0)); + std::fill_n((newg.first)->second.origin(), 2 * NPOL * NMO * NAEA, ComplexType(0.0, 0.0)); { auto pbegin = PsiT[iC].pointers_begin(); auto pend = PsiT[iC].pointers_end(); @@ -329,22 +332,15 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, if (TGwfn.TG_local().size() > 1) { - SlaterDetOperations SDetOp( - SlaterDetOperations_shared(((walker_type != NONCOLLINEAR) ? (NMO) : (2 * NMO)), - ((walker_type != NONCOLLINEAR) ? (NAEA) : (NAEA + NAEB)))); + SlaterDetOperations SDetOp(SlaterDetOperations_shared(NPOL*NMO,NAEA)); return Wavefunction(NOMSD(AFinfo, cur, TGwfn, std::move(SDetOp), std::move(HOps), std::move(ci), std::move(PsiT), walker_type, NCE, targetNW)); } else { SlaterDetOperations SDetOp( - SlaterDetOperations_serial(((walker_type != NONCOLLINEAR) - ? (NMO) - : (2 * NMO)), - ((walker_type != NONCOLLINEAR) - ? 
(NAEA) - : (NAEA + NAEB)), - device_buffer_generator.get())); + SlaterDetOperations_serial( + NPOL*NMO,NAEA,device_buffer_generator.get())); return Wavefunction(NOMSD(AFinfo, cur, TGwfn, std::move(SDetOp), std::move(HOps), std::move(ci), std::move(PsiT), walker_type, NCE, targetNW)); } @@ -362,6 +358,7 @@ Wavefunction WavefunctionFactory::fromASCII(TaskGroup_& TGprop, */ // assuming walker_type==COLLINEAR for now, specialize a type for perfect pairing PHMSD +// MAM: generatlize for NONCOLLINEAR later!!! if (walker_type != COLLINEAR) APP_ABORT("Error: PHMSD requires a COLLINEAR calculation.\n"); std::vector PsiT_MO; @@ -717,6 +714,9 @@ Wavefunction WavefunctionFactory::fromHDF5(TaskGroup_& TGprop, int NMO = AFinfo.NMO; int NAEA = AFinfo.NAEA; int NAEB = AFinfo.NAEB; + int NPOL = (walker_type == NONCOLLINEAR) ? 2 : 1; + if( (walker_type == NONCOLLINEAR) && (NAEB != 0) ) + APP_ABORT(" Error in Wavefunctions/WavefunctionFactory::fromASCII: noncollinear && NAEB!=0. \n\n\n "); std::vector excitations; @@ -832,9 +832,7 @@ Wavefunction WavefunctionFactory::fromHDF5(TaskGroup_& TGprop, if (TGwfn.TG_local().size() > 1) { - SlaterDetOperations SDetOp( - SlaterDetOperations_shared(((walker_type != NONCOLLINEAR) ? (NMO) : (2 * NMO)), - ((walker_type != NONCOLLINEAR) ? (NAEA) : (NAEA + NAEB)))); + SlaterDetOperations SDetOp(SlaterDetOperations_shared(NPOL*NMO,NAEA)); if (dense_trial == "yes") { using MType = ComplexMatrix>; @@ -858,13 +856,8 @@ Wavefunction WavefunctionFactory::fromHDF5(TaskGroup_& TGprop, else { SlaterDetOperations SDetOp( - SlaterDetOperations_serial(((walker_type != NONCOLLINEAR) - ? (NMO) - : (2 * NMO)), - ((walker_type != NONCOLLINEAR) - ? (NAEA) - : (NAEA + NAEB)), - device_buffer_generator.get())); + SlaterDetOperations_serial( + NPOL*NMO,NAEA,device_buffer_generator.get())); if (dense_trial == "yes") { using MType = ComplexMatrix>; @@ -1125,6 +1118,7 @@ void WavefunctionFactory::getInitialGuess(hdf_archive& dump, int NAEB, WALKER_TYPES walker_type) { + int NPOL = (walker_type==NONCOLLINEAR) ? 2 : 1; std::vector dims(5); if (!dump.readEntry(dims, "dims")) { @@ -1135,19 +1129,19 @@ void WavefunctionFactory::getInitialGuess(hdf_archive& dump, auto guess = initial_guess.find(name); if (guess == initial_guess.end()) { - auto newg = initial_guess.insert(std::make_pair(name, boost::multi::array({2, NMO, NAEA}))); + auto newg = initial_guess.insert(std::make_pair(name, boost::multi::array({2, NPOL*NMO, NAEA}))); if (!newg.second) APP_ABORT(" Error: Problems adding new initial guess. \n"); using ma::conj; - std::fill_n((newg.first)->second.origin(), 2 * NMO * NAEA, ComplexType(0.0, 0.0)); + std::fill_n((newg.first)->second.origin(), 2 * NPOL * NMO * NAEA, ComplexType(0.0, 0.0)); { - boost::multi::array Psi0Alpha({NMO, NAEA}); + boost::multi::array Psi0Alpha({NPOL*NMO, NAEA}); if (!dump.readEntry(Psi0Alpha, "Psi0_alpha")) { app_error() << " Error in WavefunctionFactory: Initial wavefunction Psi0_alpha not found. 
\n"; APP_ABORT(""); } - for (int i = 0; i < NMO; i++) + for (int i = 0; i < NPOL*NMO; i++) for (int j = 0; j < NAEA; j++) ((newg.first)->second)[0][i][j] = Psi0Alpha[i][j]; } diff --git a/src/AFQMC/Wavefunctions/tests/test_wfn_factory.cpp b/src/AFQMC/Wavefunctions/tests/test_wfn_factory.cpp index bed417e64e..d39ac3dcaa 100644 --- a/src/AFQMC/Wavefunctions/tests/test_wfn_factory.cpp +++ b/src/AFQMC/Wavefunctions/tests/test_wfn_factory.cpp @@ -86,6 +86,7 @@ void wfn_fac(boost::mpi3::communicator& world) int NAEA = file_data.NAEA; int NAEB = file_data.NAEB; WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); + int NPOL = (type == NONCOLLINEAR) ? 2 : 1; std::map InfoMap; InfoMap.insert(std::pair("info0", AFQMCInfo{"info0", NMO, NAEA, NAEB})); @@ -160,7 +161,7 @@ void wfn_fac(boost::mpi3::communicator& world) WalkerSet wset(TG, doc3.getRoot(), InfoMap["info0"], &rng); auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); if (type == COLLINEAR) @@ -298,7 +299,7 @@ void wfn_fac(boost::mpi3::communicator& world) WalkerSet wset2(TG, doc3.getRoot(), InfoMap["info0"], &rng); //auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); if (type == COLLINEAR) @@ -431,6 +432,7 @@ void wfn_fac_distributed(boost::mpi3::communicator& world, int ngroups) int NAEA = file_data.NAEA; int NAEB = file_data.NAEB; WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); + int NPOL = (type == NONCOLLINEAR) ? 2 : 1; std::map InfoMap; InfoMap.insert(std::pair("info0", AFQMCInfo{"info0", NMO, NAEA, NAEB})); @@ -503,7 +505,7 @@ void wfn_fac_distributed(boost::mpi3::communicator& world, int ngroups) WalkerSet wset(TG, doc3.getRoot(), InfoMap["info0"], &rng); auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); if (type == COLLINEAR) @@ -662,7 +664,7 @@ void wfn_fac_distributed(boost::mpi3::communicator& world, int ngroups) WalkerSet wset2(TG, doc3.getRoot(), InfoMap["info0"], &rng); //auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); if (type == COLLINEAR) From 41afd00404d7fd17bd888bad423d5cb300d98cfa Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Tue, 15 Sep 2020 17:56:01 -0700 Subject: [PATCH 06/10] fixed noncollienar issues in SlaterDetOps, BasePropagator and KP3 HOps --- .../KP3IndexFactorization.hpp | 96 ++++++++++++++----- src/AFQMC/Propagators/AFQMCBasePropagator.cpp | 1 + src/AFQMC/Propagators/AFQMCBasePropagator.icc | 37 ++++--- .../tests/test_propagator_factory.cpp | 11 ++- .../SlaterDetOperations_serial.hpp | 4 +- src/AFQMC/Wavefunctions/NOMSD.icc | 2 + 6 files changed, 97 insertions(+), 54 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp index 3f95758b0b..b5db2c8d2d 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization.hpp @@ -179,9 +179,7 @@ class KP3IndexFactorization { int nkpts = nopk.size(); 
int NMO = std::accumulate(nopk.begin(), nopk.end(), 0); - // in non-collinear case with SO, keep SO matrix here and add it - // for now, stay collinear - boost::multi::array P1({NMO, NMO}); + int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; // making a copy of vMF since it will be modified shmCVector vMF_(iextensions<1u>{vMF.num_elements()}, shared_allocator{*comm}); @@ -193,45 +191,91 @@ class KP3IndexFactorization } comm->barrier(); - boost::multi::array_ref P1D(to_address(P1.origin()), {NMO * NMO}); - std::fill_n(P1D.origin(), P1D.num_elements(), ComplexType(0)); - vHS(vMF_, P1D); + boost::multi::array P0({NMO, NMO}); + boost::multi::array_ref P0D(to_address(P0.origin()), {P0.num_elements()}); + std::fill_n(P0D.origin(), P0D.num_elements(), ComplexType(0)); + vHS(vMF_, P0D); if (TG.TG().size() > 1) - TG.TG().all_reduce_in_place_n(P1D.origin(), P1D.num_elements(), std::plus<>()); + TG.TG().all_reduce_in_place_n(P0D.origin(), P0D.num_elements(), std::plus<>()); - // add H1 + vn0 and symmetrize - // MAM: intel 19 doesn't accept this using statement, I'm forced to - // call ma::conj explicitly even though I should not need to - using ma::conj; + boost::multi::array P1({npol*NMO, npol*NMO}); + std::fill_n(P1.origin(), P1.num_elements(), ComplexType(0.0)); + // add spin-dependent H1 for (int K = 0, nk0 = 0; K < nkpts; ++K) { for (int i = 0, I = nk0; i < nopk[K]; i++, I++) { - P1[I][I] += H1[K][i][i] + vn0[K][i][i]; + for(int p=0; p 1e-5) + for(int p=0; p 1e-6) + P1[p*NMO+I][p*NMO+J] += H1[K][p*nopk[K]+i][p*nopk[K]+j]; + P1[p*NMO+J][p*NMO+I] += H1[K][p*nopk[K]+j][p*nopk[K]+i]; + } + } + if(walker_type == NONCOLLINEAR) { + // offdiagonal piece + for (int j = 0, J = nk0; j < nopk[K]; j++, J++) { -#endif - app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; - app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] << " " << H1[K][i][j] << " " - << H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl; - //APP_ABORT("Error in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"); + P1[I][NMO+J] += H1[K][i][nopk[K]+j]; + P1[NMO+J][I] += H1[K][nopk[K]+j][i]; } - P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I])); - P1[J][I] = ma::conj(P1[I][J]); } } nk0 += nopk[K]; } + + // add P0 (diagonal in spin) + for(int p=0; p 1e-5) + { +#else + if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6) + { +#endif + app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; + app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] <start(); auto walker_type = wset.getWalkerType(); + int npol = (walker_type==NONCOLLINEAR) ? 2 : 1; int nsteps = nsteps_; int nwalk = wset.size(); RealType sqrtdt = std::sqrt(dt); @@ -265,6 +266,7 @@ void AFQMCBasePropagator::BackPropagate(int nbpsteps, int nStabalize, WlkSet& ws { using std::copy_n; auto walker_type = wset.getWalkerType(); + int npol = (walker_type==NONCOLLINEAR) ? 2 : 1; int nwalk = wset.size(); int globalnCV = wfn.global_number_of_cholesky_vectors(); @@ -285,7 +287,7 @@ void AFQMCBasePropagator::BackPropagate(int nbpsteps, int nStabalize, WlkSet& ws assert(Fields.size(1) == globalnCV); assert(Fields.size(2) == nwalk); - int nrow(NMO * ((walker_type == NONCOLLINEAR) ? 2 : 1)); + int nrow(NMO*npol); int ncol(NAEA + ((walker_type == CLOSED) ? 
0 : NAEB)); assert(Refs.size(0) == nwalk); int nrefs = Refs.size(1); @@ -416,6 +418,7 @@ void AFQMCBasePropagator::apply_propagators(char TA, { int nwalk = wset.size(); auto walker_type = wset.getWalkerType(); + bool noncol = (walker_type == NONCOLLINEAR); int spin(0); if (spin_dependent_P1) @@ -455,28 +458,20 @@ void AFQMCBasePropagator::apply_propagators(char TA, for (int tk = tk0; tk < tkN; ++tk) { int nt = ni * nwalk + tk; - SDetOp->Propagate(*wset[tk].SlaterMatrix(Alpha), P1[0], vHS3D[nt], order, TA); + SDetOp->Propagate(*wset[tk].SlaterMatrix(Alpha), P1[0], vHS3D[nt], order, TA, noncol); } if (last_nextra > 0) { int iw = ntasks_total_serial + last_task_index; int nt = ni * nwalk + iw; - SDetOp->Propagate(*wset[iw].SlaterMatrix(Alpha), P1[0], vHS3D[nt], local_group_comm, order, TA); + SDetOp->Propagate(*wset[iw].SlaterMatrix(Alpha), P1[0], vHS3D[nt], local_group_comm, order, TA, noncol); } } } else { - if (walker_type == NONCOLLINEAR) - { - if (local_vHS.size(0) != 2 * NMO || local_vHS.size(1) != 2 * NMO) - local_vHS = std::move(CMatrix({2 * NMO, 2 * NMO})); - } - else - { - if (local_vHS.size(0) != NMO || local_vHS.size(1) != NMO) - local_vHS = std::move(CMatrix({NMO, NMO})); - } + if (local_vHS.size(0) != NMO || local_vHS.size(1) != NMO) + local_vHS = std::move(CMatrix({NMO, NMO})); // vHS3D[M][M][nstep*nwalk]: need temporary buffer in this case if (walker_type == COLLINEAR) { @@ -514,14 +509,14 @@ void AFQMCBasePropagator::apply_propagators(char TA, int nt = ni * nwalk + tk; local_vHS = vHS3D(local_vHS.extension(0), local_vHS.extension(1), nt); //std::cout<<" pp: " <Propagate(*wset[tk].SlaterMatrix(Alpha), P1[0], local_vHS, order, TA); + SDetOp->Propagate(*wset[tk].SlaterMatrix(Alpha), P1[0], local_vHS, order, TA, noncol); } if (last_nextra > 0) { int iw = ntasks_total_serial + last_task_index; int nt = ni * nwalk + iw; local_vHS = vHS3D(local_vHS.extension(0), local_vHS.extension(1), nt); - SDetOp->Propagate(*wset[iw].SlaterMatrix(Alpha), P1[0], local_vHS, local_group_comm, order, TA); + SDetOp->Propagate(*wset[iw].SlaterMatrix(Alpha), P1[0], local_vHS, local_group_comm, order, TA, noncol); } } } @@ -533,6 +528,7 @@ void AFQMCBasePropagator::apply_propagators_batched(char TA, WSet& wset, int ni, { int nwalk = wset.size(); auto walker_type = wset.getWalkerType(); + bool noncol = (walker_type == NONCOLLINEAR); int nbatch = std::min(nwalk, (nbatched_propagation < 0 ? nwalk : nbatched_propagation)); int spin(0); @@ -556,7 +552,7 @@ void AFQMCBasePropagator::apply_propagators_batched(char TA, WSet& wset, int ni, Ai.clear(); for (int ni = 0; ni < nb; ni++) Ai.emplace_back(wset[iw + ni].SlaterMatrix(Alpha)); - SDetOp->BatchedPropagate(Ai, P1[0], vHS3D.sliced(nt, nt + nb), order, TA); + SDetOp->BatchedPropagate(Ai, P1[0], vHS3D.sliced(nt, nt + nb), order, TA, noncol); if (walker_type == COLLINEAR) { Ai.clear(); @@ -568,13 +564,12 @@ void AFQMCBasePropagator::apply_propagators_batched(char TA, WSet& wset, int ni, } else { - int sz = (walker_type == NONCOLLINEAR ? 
2 * NMO : NMO); - if (local_vHS.size(0) != nbatch || local_vHS.size(1) != sz * sz) - local_vHS = std::move(CMatrix({nbatch, sz * sz})); + if (local_vHS.size(0) != nbatch || local_vHS.size(1) != NMO * NMO) + local_vHS = std::move(CMatrix({nbatch, NMO * NMO})); // vHS3D[M][M][nstep*nwalk]: need temporary buffer in this case int N2 = vHS3D.size(0) * vHS3D.size(1); CMatrix_ref vHS2D(vHS3D.origin(), {N2, vHS3D.size(2)}); - C3Tensor_ref local3D(local_vHS.origin(), {nbatch, sz, sz}); + C3Tensor_ref local3D(local_vHS.origin(), {nbatch, NMO, NMO}); int nt = ni * nwalk; for (int iw = 0; iw < nwalk; iw += nbatch, nt += nbatch) { @@ -583,7 +578,7 @@ void AFQMCBasePropagator::apply_propagators_batched(char TA, WSet& wset, int ni, Ai.clear(); for (int ni = 0; ni < nb; ni++) Ai.emplace_back(wset[iw + ni].SlaterMatrix(Alpha)); - SDetOp->BatchedPropagate(Ai, P1[0], local3D.sliced(0, nb), order, TA); + SDetOp->BatchedPropagate(Ai, P1[0], local3D.sliced(0, nb), order, TA, noncol); if (walker_type == COLLINEAR) { Ai.clear(); diff --git a/src/AFQMC/Propagators/tests/test_propagator_factory.cpp b/src/AFQMC/Propagators/tests/test_propagator_factory.cpp index 77cd3e6944..4ca73a30fe 100644 --- a/src/AFQMC/Propagators/tests/test_propagator_factory.cpp +++ b/src/AFQMC/Propagators/tests/test_propagator_factory.cpp @@ -78,6 +78,8 @@ void propg_fac_shared(boost::mpi3::communicator& world) int NMO, NAEA, NAEB; std::tie(NMO, NAEA, NAEB) = read_info_from_hdf(UTEST_HAMIL); + WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); + int NPOL = (type == NONCOLLINEAR) ? 2 : 1; std::map InfoMap; InfoMap.insert(std::pair("info0", AFQMCInfo{"info0", NMO, NAEA, NAEB})); @@ -104,7 +106,6 @@ void propg_fac_shared(boost::mpi3::communicator& world) // initialize TG buffer make_localTG_buffer_generator(TG.TG_local(), 20 * 1024L * 1024L); - WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); const char* wlk_xml_block_closed = " \ closed \ \ @@ -148,7 +149,7 @@ void propg_fac_shared(boost::mpi3::communicator& world) WalkerSet wset(TG, doc3.getRoot(), InfoMap["info0"], &rng); auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); wset.resize(nwalk, initial_guess[0], initial_guess[0]); // initial_guess[1](XXX.extension(0),{0,NAEB})); @@ -165,7 +166,6 @@ void propg_fac_shared(boost::mpi3::communicator& world) Propagator& prop = PropgFac.getPropagator(TG, prop_name, wfn, &rng); std::cout << setprecision(12); - std::cout << "energy " << std::endl; wfn.Energy(wset); { ComplexType eav = 0, ov = 0; @@ -260,6 +260,8 @@ void propg_fac_distributed(boost::mpi3::communicator& world, int ngrp) int NMO, NAEA, NAEB; std::tie(NMO, NAEA, NAEB) = read_info_from_hdf(UTEST_HAMIL); + WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); + int NPOL = (type == NONCOLLINEAR) ? 
2 : 1; std::map InfoMap; InfoMap.insert(std::pair("info0", AFQMCInfo{"info0", NMO, NAEA, NAEB})); @@ -289,7 +291,6 @@ void propg_fac_distributed(boost::mpi3::communicator& world, int ngrp) // initialize TG buffer make_localTG_buffer_generator(TG.TG_local(), 20 * 1024L * 1024L); - WALKER_TYPES type = afqmc::getWalkerType(UTEST_WFN); const char* wlk_xml_block_closed = " \ closed \ \ @@ -330,7 +331,7 @@ void propg_fac_distributed(boost::mpi3::communicator& world, int ngrp) WalkerSet wset(TG, doc3.getRoot(), InfoMap["info0"], &rng); auto initial_guess = WfnFac.getInitialGuess(wfn_name); REQUIRE(initial_guess.size(0) == 2); - REQUIRE(initial_guess.size(1) == NMO); + REQUIRE(initial_guess.size(1) == NPOL*NMO); REQUIRE(initial_guess.size(2) == NAEA); wset.resize(nwalk, initial_guess[0], initial_guess[0]); diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp index fca3c36ae2..73ff2e0ba6 100644 --- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp +++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp @@ -127,12 +127,12 @@ class SlaterDetOperations_serial : public SlaterDetOperations_base - void Propagate(Mat&& A, const MatP1& P1, const MatV& V, communicator& comm, int order = 6, char TA = 'N') + void Propagate(Mat&& A, const MatP1& P1, const MatV& V, communicator& comm, int order = 6, char TA = 'N', bool noncollinear = false) { #if defined(ENABLE_CUDA) || defined(ENABLE_HIP) APP_ABORT(" Error: SlaterDetOperations_serial should not be here. \n"); #endif - Base::Propagate(std::forward(A), P1, V, order, TA); + Base::Propagate(std::forward(A), P1, V, order, TA, noncollinear); } template diff --git a/src/AFQMC/Wavefunctions/NOMSD.icc b/src/AFQMC/Wavefunctions/NOMSD.icc index 5ad40ac542..e970967620 100644 --- a/src/AFQMC/Wavefunctions/NOMSD.icc +++ b/src/AFQMC/Wavefunctions/NOMSD.icc @@ -2314,6 +2314,8 @@ void NOMSD::vMF(Vec&& v) template inline void NOMSD::recompute_ci() { + if (walker_type == NONCOLLINEAR) + APP_ABORT(" Error: NOMSD::recompute_ci not implemented with NONCOLLINEAR.\n"); double LogOverlapFactor(0.0); int ndets = ci.size(); int nspin(1); From cc45a76609a2a870f157121821962525efac9fed Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Wed, 16 Sep 2020 09:39:44 -0700 Subject: [PATCH 07/10] fixed issues on gpu with recent noncollinear changes, all unit tests pass non-collinear KP input --- .../boost_multi/multi/memory/allocator.hpp | 3 +- .../KP3IndexFactorization_batched.hpp | 147 ++++++++++++------ .../Hamiltonians/KPFactorizedHamiltonian.cpp | 4 +- .../detail/CUDA/Kernels/KaKjw_to_KKwaj.cu | 58 ++++--- .../detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh | 6 + .../detail/CUDA/Kernels/KaKjw_to_QKajw.cu | 52 ++++--- .../detail/CUDA/Kernels/KaKjw_to_QKajw.cuh | 6 + .../detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp | 56 ++++--- .../detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h | 6 + .../detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp | 52 ++++--- .../detail/HIP/Kernels/KaKjw_to_QKajw.hip.h | 6 + src/AFQMC/Numerics/tensor_operations.hpp | 44 ++++-- .../Numerics/tests/test_tensor_operations.cpp | 4 +- .../SlaterDetOperations_serial.hpp | 2 +- .../apply_expM.hpp | 84 ++++++++++ .../SlaterDeterminantOperations/rotate.hpp | 2 +- 16 files changed, 374 insertions(+), 158 deletions(-) diff --git a/external_codes/boost_multi/multi/memory/allocator.hpp b/external_codes/boost_multi/multi/memory/allocator.hpp index d10f9d86f5..2e93733cb4 100644 --- 
a/external_codes/boost_multi/multi/memory/allocator.hpp +++ b/external_codes/boost_multi/multi/memory/allocator.hpp @@ -49,7 +49,8 @@ class allocator{ bool operator!=(allocator const& o) const{return mp_ != o.mp_;} using void_pointer = typename std::pointer_traits()->allocate(0, 0))>::template rebind; pointer allocate(size_type n){ - return static_cast(static_cast(mp_->allocate(n*sizeof(value_type), alignof(T)))); + return static_cast(static_cast(mp_->allocate(n*sizeof(value_type), 16))); + //return static_cast(static_cast(mp_->allocate(n*sizeof(value_type), alignof(T)))); } void deallocate(pointer p, size_type n){ mp_->deallocate(p, n*sizeof(value_type)); diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp index 270071a365..5d20c41780 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp @@ -82,6 +82,8 @@ class KP3IndexFactorization_batched using CMatrix_cref = boost::multi::array_ref; using CVector_ref = ComplexVector_ref; using CMatrix_ref = ComplexMatrix_ref; + using C3Tensor_ref = Complex3Tensor_ref; + using C4Tensor_ref = ComplexArray_ref<4, pointer>; using C3Tensor_cref = boost::multi::array_ref; using SpMatrix_cref = boost::multi::array_ref; @@ -338,50 +340,96 @@ class KP3IndexFactorization_batched { int nkpts = nopk.size(); int NMO = std::accumulate(nopk.begin(), nopk.end(), 0); - // in non-collinear case with SO, keep SO matrix here and add it - // for now, stay collinear + int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; CVector vMF_(vMF); - CVector P1D(iextensions<1u>{NMO * NMO}); - fill_n(P1D.origin(), P1D.num_elements(), ComplexType(0)); - vHS(vMF_, P1D); + CVector P0D(iextensions<1u>{NMO * NMO}); + fill_n(P0D.origin(), P0D.num_elements(), ComplexType(0)); + vHS(vMF_, P0D); if (TG_.TG().size() > 1) - TG_.TG().all_reduce_in_place_n(to_address(P1D.origin()), P1D.num_elements(), std::plus<>()); + TG_.TG().all_reduce_in_place_n(to_address(P0D.origin()), P0D.num_elements(), std::plus<>()); - boost::multi::array P1({NMO, NMO}); - copy_n(P1D.origin(), NMO * NMO, P1.origin()); + boost::multi::array P0({NMO, NMO}); + copy_n(P0D.origin(), NMO * NMO, P0.origin()); - // add H1 + vn0 and symmetrize - using ma::conj; + boost::multi::array P1({npol*NMO, npol*NMO}); + std::fill_n(P1.origin(), P1.num_elements(), ComplexType(0.0)); + // add spin-dependent H1 + for (int K = 0, nk0 = 0; K < nkpts; ++K) + { + for (int i = 0, I = nk0; i < nopk[K]; i++, I++) + { + for(int p=0; p 1e-5) + for(int p=0; p 1e-6) - { -#endif - app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"; - app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] << " " << H1[K][i][j] << " " - << H1[K][j][i] << " " << vn0[K][i][j] << " " << vn0[K][j][i] << std::endl; - //APP_ABORT("Error in getOneBodyPropagatorMatrix. H1 is not hermitian. \n"); + P1[p*NMO+I][p*NMO+J] += vn0[K][i][j]; + P1[p*NMO+J][p*NMO+I] += vn0[K][j][i]; } - P1[I][J] = 0.5 * (P1[I][J] + ma::conj(P1[J][I])); - P1[J][I] = ma::conj(P1[I][J]); } } nk0 += nopk[K]; } + + using ma::conj; + // symmetrize + for(int I=0; I 1e-5) + { +#else + if (std::abs(P1[I][J] - ma::conj(P1[J][I])) * 2.0 > 1e-6) + { +#endif + app_error() << " WARNING in getOneBodyPropagatorMatrix. H1 is not hermitian. 
\n"; + app_error() << I << " " << J << " " << P1[I][J] << " " << P1[J][I] <template get_allocator()); GKaKjw_to_GKKwaj(G3Da, GKK[0], nelpk[nd].sliced(0, nkpts), dev_nelpk[nd], dev_a0pk[nd]); if (walker_type == COLLINEAR) @@ -519,10 +568,11 @@ class KP3IndexFactorization_batched for (int K = 0; K < nkpts; ++K) { #if defined(MIXED_PRECISION) - CMatrix_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, nmo_max}); + C3Tensor_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol, nmo_max}); for (int a = 0; a < nelpk[nd][K]; ++a) - ma::product(ComplexType(1.), ma::T(G3Da[na + a].sliced(nk, nk + nopk[K])), haj_K[a].sliced(0, nopk[K]), - ComplexType(1.), E({0, nwalk}, 0)); + for (int pol = 0; pol < npol; ++pol) + ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+p].sliced(nk, nk + nopk[K])), + haj_K[a][pol].sliced(0, nopk[K]), ComplexType(1.), E({0, nwalk}, 0)); na += nelpk[nd][K]; if (walker_type == COLLINEAR) { @@ -538,8 +588,8 @@ class KP3IndexFactorization_batched nk = nopk[K]; { na = nelpk[nd][K]; - CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * nmo_max}); - SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * nmo_max}); + CVector_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max * npol * nmo_max}); + SpMatrix_ref Gaj(GKK[0][K][K].origin(), {nwalk, nocc_max * npol * nmo_max}); ma::product(ComplexType(1.), Gaj, haj_K, ComplexType(1.), E({0, nwalk}, 0)); } if (walker_type == COLLINEAR) @@ -582,7 +632,7 @@ class KP3IndexFactorization_batched // I WANT C++17!!!!!! long mem_ank(0); if (needs_copy) - mem_ank = nkpts * nocc_max * nchol_max * nmo_max; + mem_ank = nkpts * nocc_max * nchol_max * npol * nmo_max; StaticVector LBuff(iextensions<1u>{2 * mem_ank}, device_buffer_allocator->template get_allocator()); sp_pointer LQptr(nullptr), LQmptr(nullptr); @@ -660,8 +710,8 @@ class KP3IndexFactorization_batched if (batch_cnt >= batch_size) { - gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0), - Aarray.data(), nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(), + gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0), + Aarray.data(), npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max, Aarray.size()); copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin()); @@ -691,8 +741,8 @@ class KP3IndexFactorization_batched if (batch_cnt > 0) { - gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, nmo_max, SPComplexType(1.0), Aarray.data(), - nmo_max, Barray.data(), nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max, + gemmBatched('T', 'N', nocc_max * nchol_max, nwalk * nocc_max, npol*nmo_max, SPComplexType(1.0), Aarray.data(), + npol*nmo_max, Barray.data(), npol*nmo_max, SPComplexType(0.0), Carray.data(), nocc_max * nchol_max, Aarray.size()); copy_n(scl_factors.data(), scl_factors.size(), dev_scl_factors.origin()); @@ -1322,6 +1372,7 @@ class KP3IndexFactorization_batched assert(v.size(0) == 2 * local_nCV); assert(v.size(1) == nwalk); int nspin = (walker_type == COLLINEAR ? 2 : 1); + int npol = (walker_type == NONCOLLINEAR ? 
2 : 1); int nmo_tot = std::accumulate(nopk.begin(), nopk.end(), 0); int nmo_max = *std::max_element(nopk.begin(), nopk.end()); int nocca_tot = std::accumulate(nelpk[nd].begin(), nelpk[nd].begin() + nkpts, 0); @@ -1340,11 +1391,11 @@ class KP3IndexFactorization_batched SPComplexType minusimhalfa(0.0, -0.5 * a * scl); SPComplexType imhalfa(0.0, 0.5 * a * scl); - assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * nmo_tot); + assert(G.num_elements() == nwalk * (nocca_tot + noccb_tot) * npol * nmo_tot); // MAM: use reshape when available, then no need to deal with types using GType = typename std::decay::type::element; boost::multi::array_ref G3Da(make_device_ptr(G.origin()), - {nocca_tot, nmo_tot, nwalk}); + {nocca_tot*npol, nmo_tot, nwalk}); boost::multi::array_ref G3Db(make_device_ptr(G.origin()) + G3Da.num_elements() * (nspin - 1), @@ -1358,7 +1409,7 @@ class KP3IndexFactorization_batched size_t cnt(0); Static3Tensor v1({nkpts + number_of_symmetric_Q, nchol_max, nwalk}, device_buffer_allocator->template get_allocator()); - Static3Tensor GQ({nkpts, nkpts * nocc_max * nmo_max, nwalk}, + Static3Tensor GQ({nkpts, nkpts * nocc_max * npol * nmo_max, nwalk}, device_buffer_allocator->template get_allocator()); fill_n(v1.origin(), v1.num_elements(), SPComplexType(0.0)); fill_n(GQ.origin(), GQ.num_elements(), SPComplexType(0.0)); @@ -1370,7 +1421,7 @@ class KP3IndexFactorization_batched dev_a0pk[nd].sliced(nkpts, 2 * nkpts)); // can use productStridedBatched if LQKakn is changed to a 3Tensor array - int Kak = nkpts * nocc_max * nmo_max; + int Kak = nkpts * nocc_max * npol * nmo_max; std::vector Aarray; std::vector Barray; std::vector Carray; @@ -1530,30 +1581,32 @@ class KP3IndexFactorization_batched template void GKaKjw_to_GKKwaj(MatA const& GKaKj, MatB&& GKKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0) { + int npol = (walker_type == NONCOLLINEAR) ? 2 : 1; int nmo_max = *std::max_element(nopk.begin(), nopk.end()); // int nocc_max = *std::max_element(nocc.begin(),nocc.end()); int nmo_tot = GKaKj.size(1); int nwalk = GKaKj.size(2); int nkpts = nopk.size(); - assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max); + assert(GKKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max); using ma::KaKjw_to_KKwaj; - KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), + KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), dev_a0.origin(), GKaKj.origin(), GKKaj.origin()); } template void GKaKjw_to_GQKajw(MatA const& GKaKj, MatB&& GQKaj, IVec&& nocc, IVec2&& dev_no, IVec2&& dev_a0) { + int npol = (walker_type == NONCOLLINEAR) ? 
2 : 1; int nmo_max = *std::max_element(nopk.begin(), nopk.end()); // int nocc_max = *std::max_element(nocc.begin(),nocc.end()); int nmo_tot = GKaKj.size(1); int nwalk = GKaKj.size(2); int nkpts = nopk.size(); - assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * nmo_max); + assert(GQKaj.num_elements() >= nkpts * nkpts * nwalk * nocc_max * npol * nmo_max); using ma::KaKjw_to_QKajw; - KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), + KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, dev_nopk.origin(), dev_i0pk.origin(), dev_no.origin(), dev_a0.origin(), dev_QKToK2.origin(), GKaKj.origin(), GQKaj.origin()); } diff --git a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp index 27b9a7ee72..ff281cd796 100644 --- a/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp +++ b/src/AFQMC/Hamiltonians/KPFactorizedHamiltonian.cpp @@ -1143,7 +1143,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( for (int nd = 0; nd < ndet; nd++) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) - LQKbnl.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKbnl.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator{distNode})); if (type == COLLINEAR) { for (int Q = 0; Q < number_of_symmetric_Q; Q++) @@ -1161,7 +1161,7 @@ HamiltonianOperations KPFactorizedHamiltonian::getHamiltonianOperations_batched( { for (int Q = 0; Q < number_of_symmetric_Q; Q++) { - LQKbln.emplace_back(shmSpMatrix({nkpts, ank_max}, shared_allocator{distNode})); + LQKbln.emplace_back(shmSpMatrix({nkpts, npol*ank_max}, shared_allocator{distNode})); } if (type == COLLINEAR) { diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu index acdc06c653..45c56c7c90 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cu @@ -24,11 +24,12 @@ namespace kernels { // very sloppy, needs improvement!!!! 
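// Editor's note (assumed semantics of the kernels below, for orientation): the
// repack implemented is
//   B[Ka][Kj][w][a][p][j] = A[na0[Ka] + a][p][nj0[Kj] + j][w]
// with one CUDA block per (Ka, Kj, p) triple -- gridDim = {nkpts, nkpts, npol} --
// and the (j, w) loops strided over threadIdx.x / threadIdx.y.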
-// A[nocc_tot][nmo_tot][nwalk] -// B[Ka][Kj][nwalk][nocc_max][nmo_max] +// A[nocc_tot][s][nmo_tot][nwalk] +// B[Ka][Kj][nwalk][nocc_max][s][nmo_max] template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -41,30 +42,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - T const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); } template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -77,30 +80,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - thrust::complex const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + thrust::complex const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); } void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -114,8 +119,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -123,6 +128,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -136,8 +142,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - 
kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -145,6 +151,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -158,8 +165,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -167,6 +174,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -180,8 +188,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); @@ -190,6 +198,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -203,8 +212,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); @@ -213,6 +222,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -226,8 +236,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_KKwaj<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_KKwaj"); diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh index 0553530836..cb2ee706b5 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_KKwaj.cuh @@ -23,6 +23,7 @@ namespace kernels { void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -34,6 +35,7 @@ void 
KaKjw_to_KKwaj(int nwalk,
                     double* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -45,6 +47,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     float* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -56,6 +59,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     float* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -67,6 +71,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     std::complex<double>* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -78,6 +83,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     std::complex<float>* B);
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,

diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu
index 92f84e1675..f6527b0e72 100644
--- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu
+++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cu
@@ -29,6 +29,7 @@ namespace kernels
 template <typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -42,7 +43,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol >= npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -50,15 +52,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na = nocc[K];
   int nj = nmo[QK];
-  T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);

   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;

-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol * nmo_max * nwalk, a1 = pol * nmo_tot * nwalk;
+       a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<T2>(A_[a1 + j * nwalk + n]);
@@ -67,6 +70,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 template <typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -77,7 +81,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol >= npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -85,15 +90,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na = nocc[K];
   int nj = nmo[QK];
-  thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);

   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;

-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol * nmo_max * nwalk, a1 = pol * nmo_tot * nwalk;
+       a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
   {
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
@@
-106,6 +112,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -120,8 +127,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -129,6 +136,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -143,8 +151,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -152,6 +160,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -166,8 +175,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); qmc_cuda::cuda_check(cudaDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -175,6 +184,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -189,8 +199,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); @@ -199,6 +209,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -213,8 +224,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); @@ -223,6 +234,7 @@ void KaKjw_to_QKajw(int nwalk, void 
KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -237,8 +249,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, + dim3 grid_dim(nkpts, nkpts, npol); + kernel_KaKjw_to_QKajw<<>>(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_cuda::cuda_check(cudaGetLastError(), "KaKjw_to_QKajw"); diff --git a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh index 164e6c5988..0315a29b9a 100644 --- a/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh +++ b/src/AFQMC/Numerics/detail/CUDA/Kernels/KaKjw_to_QKajw.cuh @@ -23,6 +23,7 @@ namespace kernels { void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -35,6 +36,7 @@ void KaKjw_to_QKajw(int nwalk, double* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -47,6 +49,7 @@ void KaKjw_to_QKajw(int nwalk, float* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -59,6 +62,7 @@ void KaKjw_to_QKajw(int nwalk, float* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -71,6 +75,7 @@ void KaKjw_to_QKajw(int nwalk, std::complex* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -83,6 +88,7 @@ void KaKjw_to_QKajw(int nwalk, std::complex* B); void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp index b4bd304bae..0bbcd01573 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.cpp @@ -25,6 +25,7 @@ namespace kernels template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -37,30 +38,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - T const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + T2* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast(A_[a1 + j * nwalk + n]); } template __global__ void kernel_KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -73,30 +76,32 @@ __global__ void kernel_KaKjw_to_KKwaj(int nwalk, { int Ka = 
blockIdx.x; int Kj = blockIdx.y; - if (Ka >= nkpts || Kj >= nkpts) + int pol = blockIdx.z; + if (Ka >= nkpts || Kj >= nkpts || pol >= npol) return; int na0 = nocc0[Ka]; int nj0 = nmo0[Kj]; int na = nocc[Ka]; int nj = nmo[Kj]; - thrust::complex const* A_(A + (na0 * nmo_tot + nj0) * nwalk); - thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * nmo_max * nwalk); + thrust::complex const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk); + thrust::complex* B_(B + ((Ka * nkpts + Kj) * nocc_max) * npol * nmo_max * nwalk); if (threadIdx.x >= nj) return; if (threadIdx.y >= nwalk) return; - for (int a = 0, a1 = 0; a < na; a++, a1 += nmo_tot * nwalk) + for (int a = 0, a1 = pol * nmo_tot * nwalk; a < na; a++, a1 += npol * nmo_tot * nwalk) for (int j = threadIdx.x; j < nj; j += blockDim.x) for (int n = threadIdx.y; n < nwalk; n += blockDim.y) - B_[n * nocc_max * nmo_max + a * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); + B_[((n * nocc_max + a) * npol + pol) * nmo_max + j] = static_cast>(A_[a1 + j * nwalk + n]); } void KaKjw_to_KKwaj(int nwalk, - int nkpts, + int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -110,8 +115,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -119,6 +124,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -132,8 +138,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -141,6 +147,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -154,8 +161,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_KKwaj"); @@ -163,6 +170,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -176,8 +184,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 
block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); @@ -186,6 +194,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -199,8 +208,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); @@ -209,6 +218,7 @@ void KaKjw_to_KKwaj(int nwalk, void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -222,8 +232,8 @@ void KaKjw_to_KKwaj(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_KKwaj, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_KKwaj"); diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h index 1fa43f4927..ebab86b49b 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_KKwaj.hip.h @@ -20,6 +20,7 @@ namespace kernels { void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -31,6 +32,7 @@ void KaKjw_to_KKwaj(int nwalk, double* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -42,6 +44,7 @@ void KaKjw_to_KKwaj(int nwalk, float* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -53,6 +56,7 @@ void KaKjw_to_KKwaj(int nwalk, float* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -64,6 +68,7 @@ void KaKjw_to_KKwaj(int nwalk, std::complex* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -75,6 +80,7 @@ void KaKjw_to_KKwaj(int nwalk, std::complex* B); void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, diff --git a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp index 9b9ca77ccd..6fd46e0bc1 100644 --- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp +++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.cpp @@ -26,6 +26,7 @@ namespace kernels template 
__global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -39,7 +40,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol >= npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -47,15 +49,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na = nocc[K];
   int nj = nmo[QK];
-  T const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  T2* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  T const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  T2* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);

   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;

-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol * nmo_max * nwalk, a1 = pol * nmo_tot * nwalk;
+       a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<T2>(A_[a1 + j * nwalk + n]);
@@ -64,6 +67,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 template <typename T, typename T2>
 __global__ void kernel_KaKjw_to_QKajw(int nwalk,
                                       int nkpts,
+                                      int npol,
                                       int nmo_max,
                                       int nmo_tot,
                                       int nocc_max,
@@ -77,7 +81,8 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
 {
   int Q = blockIdx.x;
   int K = blockIdx.y;
-  if (Q >= nkpts || K >= nkpts)
+  int pol = blockIdx.z;
+  if (Q >= nkpts || K >= nkpts || pol >= npol)
     return;
   int QK  = QKtok2[Q * nkpts + K];
   int na0 = nocc0[K];
@@ -85,15 +90,16 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,
   int na = nocc[K];
   int nj = nmo[QK];
-  thrust::complex<T> const* A_(A + (na0 * nmo_tot + nj0) * nwalk);
-  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * nmo_max * nwalk);
+  thrust::complex<T> const* A_(A + (na0 * npol * nmo_tot + nj0) * nwalk);
+  thrust::complex<T2>* B_(B + ((Q * nkpts + K) * nocc_max) * npol * nmo_max * nwalk);

   if (threadIdx.x >= nj)
     return;
   if (threadIdx.y >= nwalk)
     return;

-  for (int a = 0, a0 = 0, a1 = 0; a < na; a++, a0 += nmo_max * nwalk, a1 += nmo_tot * nwalk)
+  for (int a = 0, a0 = pol * nmo_max * nwalk, a1 = pol * nmo_tot * nwalk;
+       a < na; a++, a0 += npol * nmo_max * nwalk, a1 += npol * nmo_tot * nwalk)
   {
     for (int j = threadIdx.x; j < nj; j += blockDim.x)
       for (int n = threadIdx.y; n < nwalk; n += blockDim.y)
         B_[a0 + j * nwalk + n] = static_cast<thrust::complex<T2>>(A_[a1 + j * nwalk + n]);
@@ -103,6 +109,7 @@ __global__ void kernel_KaKjw_to_QKajw(int nwalk,

 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -117,8 +124,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3 grid_dim(nkpts, nkpts, 1);
-  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot,
+  dim3 grid_dim(nkpts, nkpts, npol);
+  hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot,
                      nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B);
   qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw");
   qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw");
@@ -126,6 +133,7 @@ void KaKjw_to_QKajw(int nwalk,

 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -140,8 +148,8 @@ void KaKjw_to_QKajw(int nwalk,
   int xblock_dim = 16;
   int yblock_dim = std::min(nwalk, 32);
   dim3 block_dim(xblock_dim, yblock_dim, 1);
-  dim3
grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -149,6 +157,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -163,8 +172,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, A, B); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); qmc_hip::hip_check(hipDeviceSynchronize(), "KaKjw_to_QKajw"); @@ -172,6 +181,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -186,8 +196,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); @@ -196,6 +206,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -210,8 +221,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); @@ -220,6 +231,7 @@ void KaKjw_to_QKajw(int nwalk, void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -234,8 +246,8 @@ void KaKjw_to_QKajw(int nwalk, int xblock_dim = 16; int yblock_dim = std::min(nwalk, 32); dim3 block_dim(xblock_dim, yblock_dim, 1); - dim3 grid_dim(nkpts, nkpts, 1); - hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, nmo_max, nmo_tot, + dim3 grid_dim(nkpts, nkpts, npol); + hipLaunchKernelGGL(kernel_KaKjw_to_QKajw, dim3(grid_dim), dim3(block_dim), 0, 0, nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, nmo, nmo0, nocc, nocc0, QKtok2, reinterpret_cast const*>(A), reinterpret_cast*>(B)); qmc_hip::hip_check(hipGetLastError(), "KaKjw_to_QKajw"); diff --git 
a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h
index e773dce130..c8b84a0219 100644
--- a/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h
+++ b/src/AFQMC/Numerics/detail/HIP/Kernels/KaKjw_to_QKajw.hip.h
@@ -20,6 +20,7 @@ namespace kernels
 {
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -32,6 +33,7 @@ void KaKjw_to_QKajw(int nwalk,
                     double* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -44,6 +46,7 @@ void KaKjw_to_QKajw(int nwalk,
                     float* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -56,6 +59,7 @@ void KaKjw_to_QKajw(int nwalk,
                     float* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -68,6 +72,7 @@ void KaKjw_to_QKajw(int nwalk,
                     std::complex<double>* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -80,6 +85,7 @@ void KaKjw_to_QKajw(int nwalk,
                     std::complex<float>* B);
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,

diff --git a/src/AFQMC/Numerics/tensor_operations.hpp b/src/AFQMC/Numerics/tensor_operations.hpp
index f4ea13191d..e93757a64b 100644
--- a/src/AFQMC/Numerics/tensor_operations.hpp
+++ b/src/AFQMC/Numerics/tensor_operations.hpp
@@ -29,6 +29,7 @@ namespace ma
 template
 void KaKjw_to_KKwaj(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -40,7 +41,7 @@ void KaKjw_to_KKwaj(int nwalk,
                     T* B)
 {
   // OpenMP: Combine Ka,Kj loops into single loop and call parallel for
-  int naj = nocc_max * nmo_max;
+  int napj = nocc_max * npol * nmo_max;
   int na0 = 0;
   for (int Ka = 0; Ka < nkpts; Ka++)
   {
@@ -50,16 +51,19 @@ void KaKjw_to_KKwaj(int nwalk,
     {
       int nj = nopk[Kj];
       //auto G_(to_address(GKK[0][Ka][Kj].origin()));
-      auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * nmo_max);
+      auto G_(B + (Ka * nkpts + Kj) * nwalk * nocc_max * npol * nmo_max);
       for (int a = 0; a < na; a++)
       {
-        //auto Gc_( to_address(Gca[na0+a][nj0].origin()) );
-        auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk);
-        int aj = a * nmo_max;
-        for (int j = 0; j < nj; j++, aj++)
+        //auto Gc_( to_address(Gca[na0+a][p][nj0].origin()) );
+        for (int p = 0; p < npol; p++)
         {
-          for (int w = 0, waj = 0; w < nwalk; w++, ++Gc_, waj += naj)
-            G_[waj + aj] = static_cast<T>(*Gc_);
+          // each (a,p) block is padded to nmo_max, so apj must restart at
+          // (a*npol + p)*nmo_max for every p rather than run on across p
+          int apj = (a * npol + p) * nmo_max;
+          auto Gc_(A + ((na0 + a) * npol + p) * nmo_tot * nwalk + nj0 * nwalk);
+          for (int j = 0; j < nj; j++, apj++)
+          {
+            for (int w = 0, wapj = 0; w < nwalk; w++, ++Gc_, wapj += napj)
+              G_[wapj + apj] = static_cast<T>(*Gc_);
+          }
         }
       }
       nj0 += nj;
@@ -71,6 +75,7 @@ void KaKjw_to_KKwaj(int nwalk,
 template
 void KaKjw_to_QKajw(int nwalk,
                     int nkpts,
+                    int npol,
                     int nmo_max,
                     int nmo_tot,
                     int nocc_max,
@@ -94,16 +99,19 @@ void KaKjw_to_QKajw(int nwalk,
       int na0 = nocc0[Ka];
       int nj0 = nmo0[Kj];
       //auto G_(to_address(GKK[Q][K].origin()));
-      auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * nmo_max);
-      for (int a = 0, a0 = 0; a < na; a++, a0 += nmo_max * nwalk)
+      auto G_(B + (Q * nkpts + K) * nwalk * nocc_max * npol * nmo_max);
+      for (int a = 0, a0 = 0; a < na; a++)
       {
-        //auto Gc_( to_address(Gca[na0+a][nj0].origin()) );
-        auto Gc_(A + (na0 + a) * nmo_tot * nwalk + nj0 * nwalk);
-        for (int j = 0, aj = a0; j < nj; j++, aj += nwalk)
+        for (int p = 0; p < npol; p++, a0 += nmo_max * nwalk)
         {
-          for (int w = 0; w < nwalk; w++, ++Gc_)
+          //auto Gc_(
to_address(Gca[na0+a][p][nj0].origin()) ); + auto Gc_(A + ((na0 + a) * npol + p) * nmo_tot * nwalk + nj0 * nwalk); + for (int j = 0, apj = a0; j < nj; j++, apj += nwalk) { - G_[aj + w] = static_cast(*Gc_); + for (int w = 0; w < nwalk; w++, ++Gc_) + { + G_[apj + w] = static_cast(*Gc_); + } } } } @@ -275,6 +283,7 @@ namespace device template void KaKjw_to_KKwaj(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -285,13 +294,14 @@ void KaKjw_to_KKwaj(int nwalk, device_pointer A, device_pointer B) { - kernels::KaKjw_to_KKwaj(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), + kernels::KaKjw_to_KKwaj(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), to_address(nocc0), to_address(A), to_address(B)); } template void KaKjw_to_QKajw(int nwalk, int nkpts, + int npol, int nmo_max, int nmo_tot, int nocc_max, @@ -303,7 +313,7 @@ void KaKjw_to_QKajw(int nwalk, device_pointer A, device_pointer B) { - kernels::KaKjw_to_QKajw(nwalk, nkpts, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), + kernels::KaKjw_to_QKajw(nwalk, nkpts, npol, nmo_max, nmo_tot, nocc_max, to_address(nmo), to_address(nmo0), to_address(nocc), to_address(nocc0), to_address(QKtok2), to_address(A), to_address(B)); } diff --git a/src/AFQMC/Numerics/tests/test_tensor_operations.cpp b/src/AFQMC/Numerics/tests/test_tensor_operations.cpp index 8b54c41241..b527aee87d 100644 --- a/src/AFQMC/Numerics/tests/test_tensor_operations.cpp +++ b/src/AFQMC/Numerics/tests/test_tensor_operations.cpp @@ -100,7 +100,7 @@ TEST_CASE("KaKjw_to_KKwaj", "[Numerics][tensor_operations]") // KaKjw = numpy.arange(nk*na*nk*nj*nw).reshape((nk,na,nk,nj,nw)) // KKwaj = numpy.transpose(KaKjw, (0,2,1,3,4)) using ma::KaKjw_to_KKwaj; - KaKjw_to_KKwaj(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), + KaKjw_to_KKwaj(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), nel_pk0.origin(), KaKjw.origin(), KKwaj.origin()); // Reference values from python REQUIRE(KKwaj[1][2][3][4][4] == Approx(1473.0)); @@ -127,7 +127,7 @@ TEST_CASE("KaKjw_to_QKwaj", "[Numerics][tensor_operations]") Tensor2D qk_to_k2 = {{0, 1, 2, 3}, {1, 0, 3, 2}, {2, 3, 0, 1}, {3, 2, 1, 0}}; copy_n(buffer.data(), buffer.size(), KaKjw.origin()); using ma::KaKjw_to_QKajw; - KaKjw_to_QKajw(nwalk, nk, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), + KaKjw_to_QKajw(nwalk, nk, 1, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(), nmo_pk0.origin(), nel_pk.origin(), nel_pk0.origin(), qk_to_k2.origin(), KaKjw.origin(), QKajw.origin()); // Just captured from output. 
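  // Note: npol is 1 in these calls, which reduces the new spinor-aware layout
  // to the collinear one the reference values in these tests were generated
  // with. A noncollinear check would pass npol = 2 with inputs carrying the
  // extra spinor dimension after the occupied index, e.g. (sketch only;
  // KaKjw2/QKajw2 with the doubled spinor axis are hypothetical arrays):
  //   KaKjw_to_QKajw(nwalk, nk, 2, nmo_k, nk * nmo_k, occ_k, nmo_pk.origin(),
  //                  nmo_pk0.origin(), nel_pk.origin(), nel_pk0.origin(),
  //                  qk_to_k2.origin(), KaKjw2.origin(), QKajw2.origin());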
REQUIRE(QKajw[1][2][3][4][4] == Approx(1904.0));

diff --git a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp
index 73ff2e0ba6..bc43967aaa 100644
--- a/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp
+++ b/src/AFQMC/SlaterDeterminantOperations/SlaterDetOperations_serial.hpp
@@ -180,7 +180,7 @@ class SlaterDetOperations_serial : public SlaterDetOperations_base
+template <class MatA, class MatB, class MatC>
+inline void apply_expM_noncollinear(const MatA& V, MatB&& S, MatC& T1, MatC& T2,
+                                    int order = 6, char TA = 'N')
+{
+  static_assert(std::decay<MatA>::type::dimensionality == 3, " batched::apply_expM_noncollinear::dimensionality == 3");
+  static_assert(std::decay<MatB>::type::dimensionality == 3, " batched::apply_expM_noncollinear::dimensionality == 3");
+  static_assert(std::decay<MatC>::type::dimensionality == 3, " batched::apply_expM_noncollinear::dimensionality == 3");
+  assert(V.size(0) * 2 == S.size(0));
+  assert(V.size(0) * 2 == T1.size(0));
+  assert(V.size(0) * 2 == T2.size(0));
+  assert(V.size(1) == V.size(2));
+  assert(V.size(2) == S.size(1));
+  assert(S.size(1) == T1.size(1));
+  assert(S.size(2) == T1.size(2));
+  assert(S.size(1) == T2.size(1));
+  assert(S.size(2) == T2.size(2));
+  // for now limit to contiguous arrays
+  assert(S.stride(0) == S.size(1) * S.size(2));
+  assert(T1.stride(0) == T1.size(1) * T1.size(2));
+  assert(T2.stride(0) == T2.size(1) * T2.size(2));
+  assert(S.stride(1) == S.size(2));
+  assert(T1.stride(1) == T1.size(2));
+  assert(T2.stride(1) == T2.size(2));
+  assert(S.stride(2) == 1);
+  assert(T1.stride(2) == 1);
+  assert(T2.stride(2) == 1);
+
+  using ComplexType = typename std::decay<MatB>::type::element;
+  ComplexType zero(0.);
+  ComplexType im(0.0, 1.0);
+  if (TA == 'H' || TA == 'h')
+    im = ComplexType(0.0, -1.0);
+  auto pT1(std::addressof(T1));
+  auto pT2(std::addressof(T2));
+
+  using pointerA = typename std::decay<MatA>::type::element_const_ptr;
+  using pointerC = typename std::decay<MatC>::type::element_ptr;
+
+  int nbatch = S.size(0);
+  int ldv    = V.stride(1);
+  int M      = T2.size(2);
+  int N      = T2.size(1);
+  int K      = T1.size(1);
+
+  std::vector<pointerA> Vi;
+  std::vector<pointerC> T1i;
+  std::vector<pointerC> T2i;
+  Vi.reserve(2 * V.size(0));
+  T1i.reserve(T1.size(0));
+  T2i.reserve(T2.size(0));
+  // both spin blocks of a spinor walker propagate with the same V,
+  // so each V[i] enters the batch list twice
+  for (int i = 0; i < V.size(0); i++)
+  {
+    Vi.emplace_back(ma::pointer_dispatch(V[i].origin()));
+    Vi.emplace_back(ma::pointer_dispatch(V[i].origin()));
+  }
+  for (int i = 0; i < T1.size(0); i++)
+    T1i.emplace_back(ma::pointer_dispatch(T1[i].origin()));
+  for (int i = 0; i < T2.size(0); i++)
+    T2i.emplace_back(ma::pointer_dispatch(T2[i].origin()));
+
+  auto pT1i(std::addressof(T1i));
+  auto pT2i(std::addressof(T2i));
+
+  // getting around issue in multi, fix later
+  //T1 = S;
+  using std::copy_n;
+  copy_n(S.origin(), S.num_elements(), T1.origin());
+  // Taylor expansion of the propagator: accumulates one order of
+  // S * (im*V)^n / n! per iteration, so on exit S <- exp(im*V) applied to S
+  for (int n = 1; n <= order; n++)
+  {
+    ComplexType fact = im * static_cast<ComplexType>(1.0 / static_cast<double>(n));
+    using ma::gemmBatched;
+    // careful with fortran ordering
+    gemmBatched('N', TA, M, N, K, fact, pT1i->data(), (*pT1).stride(1), Vi.data(), ldv,
+                zero, pT2i->data(), (*pT2).stride(1), nbatch);
+    using ma::axpy;
+    axpy(S.num_elements(), ComplexType(1.0), (*pT2).origin(), 1, S.origin(), 1);
+    std::swap(pT1, pT2);
+    std::swap(pT1i, pT2i);
+  }
+}
+
 } // namespace batched

 } // namespace SlaterDeterminantOperations

diff --git a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp
index cdf85807d4..2c04324eb8 100644
--- a/src/AFQMC/SlaterDeterminantOperations/rotate.hpp
+++ b/src/AFQMC/SlaterDeterminantOperations/rotate.hpp
@@ -779,7 +779,7 @@
void getLakn_Lank_from_Lkin(MultiArray2DA&& Aai, // Lakn[a][sk][n] = sum_i Aai[as][i] conj(Lkin[k][i][n]) for (int k = 0; k < nmo; k++) { - ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aai), bnas); + ma::product(ma::H(Lkin[k].sliced(0, ni)), ma::T(Aas_i), bnas); for (int a = 0; a < na; a++) for (int p = 0; p < npol; p++) Lakn[a][p*nmo+k] = bnas({0, nchol}, a*npol+p); From f86d1df71f0c887cb576c69ff70e6c58a7bf3125 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Wed, 16 Sep 2020 15:58:28 -0700 Subject: [PATCH 08/10] small bug fix on single precision gpu build --- .../HamiltonianOperations/KP3IndexFactorization_batched.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp index 5d20c41780..bbdb453dee 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp @@ -571,7 +571,7 @@ class KP3IndexFactorization_batched C3Tensor_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol, nmo_max}); for (int a = 0; a < nelpk[nd][K]; ++a) for (int pol = 0; pol < npol; ++pol) - ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+p].sliced(nk, nk + nopk[K])), + ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+pol].sliced(nk, nk + nopk[K])), haj_K[a][pol].sliced(0, nopk[K]), ComplexType(1.), E({0, nwalk}, 0)); na += nelpk[nd][K]; if (walker_type == COLLINEAR) From 288629efdcbcf6167edb13eb48c95edf1917a9ac Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Tue, 29 Sep 2020 18:25:32 -0700 Subject: [PATCH 09/10] fix small bug with padding of H1 in GPU --- .../KP3IndexFactorization_batched.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp index bbdb453dee..a23b02a3ac 100644 --- a/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp +++ b/src/AFQMC/HamiltonianOperations/KP3IndexFactorization_batched.hpp @@ -568,22 +568,23 @@ class KP3IndexFactorization_batched for (int K = 0; K < nkpts; ++K) { #if defined(MIXED_PRECISION) - C3Tensor_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol, nmo_max}); - for (int a = 0; a < nelpk[nd][K]; ++a) + int ni(nopk[K]); + CMatrix_ref haj_K(make_device_ptr(haj[nd * nkpts + K].origin()), {nocc_max, npol*nmo_max}); + for (int a = 0; a < nelpk[nd][K]; ++a) for (int pol = 0; pol < npol; ++pol) - ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+pol].sliced(nk, nk + nopk[K])), - haj_K[a][pol].sliced(0, nopk[K]), ComplexType(1.), E({0, nwalk}, 0)); + ma::product(ComplexType(1.), ma::T(G3Da[(na + a)*npol+pol].sliced(nk, nk + ni)), + haj_K[a].sliced(pol*ni, pol*ni+ni), ComplexType(1.), E({0, nwalk}, 0)); na += nelpk[nd][K]; if (walker_type == COLLINEAR) { boost::multi::array_ref haj_Kb(haj_K.origin() + haj_K.num_elements(), {nocc_max, nmo_max}); for (int b = 0; b < nelpk[nd][nkpts + K]; ++b) - ma::product(ComplexType(1.), ma::T(G3Db[nb + b].sliced(nk, nk + nopk[K])), haj_Kb[b].sliced(0, nopk[K]), + ma::product(ComplexType(1.), ma::T(G3Db[nb + b].sliced(nk, nk + ni)), haj_Kb[b].sliced(0, ni), ComplexType(1.), E({0, nwalk}, 0)); nb += nelpk[nd][nkpts + K]; } - nk += nopk[K]; + nk += ni; #else nk = nopk[K]; { From 05fb232c11456e459a8595b8ab7847cb705c7dd3 Mon Sep 17 00:00:00 2001 From: mmorale3 Date: Fri, 2 Oct 2020 11:11:08 -0700 
Subject: [PATCH 10/10] small bug fix

---
 .../HamiltonianOperations/tests/test_hamiltonian_operations.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp
index 066d3290d2..8adf8c9f00 100644
--- a/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp
+++ b/src/AFQMC/HamiltonianOperations/tests/test_hamiltonian_operations.cpp
@@ -247,7 +247,7 @@ void ham_ops_basic_serial(boost::mpi3::communicator& world)
   ComplexType Xsum = 0, Xsum2=0;
   for(int i=0; i1e-8) {
     REQUIRE( real(Xsum) == Approx(real(file_data.Xsum)) );
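
The (partially garbled) loop and guard above accumulate Xsum and compare it
against the stored reference only when that reference is meaningfully nonzero;
Catch2's Approx() is a relative check and degenerates when the expected value
is zero. A minimal standalone sketch of the same pattern ('sum' and 'ref' are
hypothetical names, not from the test):

    // Skip relative comparisons against a reference value of zero;
    // Approx() scales its tolerance by the expected value.
    if (std::abs(ref) > 1e-8)
    {
      REQUIRE(real(sum) == Approx(real(ref)));
      REQUIRE(imag(sum) == Approx(imag(ref)));
    }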