Skip to content

Commit

Permalink
Merge pull request #4379 from ye-luo/fix-icpx
Browse files Browse the repository at this point in the history
Protect complex reduction in real builds.
  • Loading branch information
ye-luo authored Jan 4, 2023
2 parents 5c71b89 + 19d8332 commit 0e75f5c
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 6 deletions.
6 changes: 3 additions & 3 deletions src/QMCWaveFunctions/Fermion/DiracDeterminantBatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,9 @@ void DiracDeterminantBatched<DET_ENGINE>::mw_evalGradWithSpin(
mw_dspin.resize(nw, num_orbitals);

//Here, we are just always recomputing the spin gradients from the SPOSet for simplicity.
//If we stored and modified the accept/reject to include updating stored spin gradients, we could the
//mw_evaluateVGLWithSpin call below and just use the stored spin gradients.
//May revisit this in the future.
//If we stored and modified the accept/reject to include updating stored spin gradients, we could skip the
//mw_evaluateVGLWithSpin call below and just use the stored spin gradients.
//May revisit this in the future.
RefVectorWithLeader<SPOSet> phi_list(*Phi);
RefVector<SPOSet::ValueVector> psi_v_list, lap_v_list;
RefVector<SPOSet::GradVector> grad_v_list;
Expand Down
2 changes: 1 addition & 1 deletion src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class MatrixDelayedUpdateCUDA
template<typename DT>
using OffloadMWVGLArray = Array<DT, 3, OffloadPinnedAllocator<DT>>; // [VGL, walker, Orbs]
template<typename DT>
using OffloadMatrix = Matrix<DT, OffloadPinnedAllocator<DT>>;
using OffloadMatrix = Matrix<DT, OffloadPinnedAllocator<DT>>;

struct MatrixDelayedUpdateCUDAMultiWalkerMem : public Resource
{
Expand Down
10 changes: 8 additions & 2 deletions src/QMCWaveFunctions/Fermion/MatrixUpdateOMPTarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ class MatrixUpdateOMPTarget
auto& spingrads_value_v = engine_leader.mw_mem_->spingrads_value_v;

//Need to pack these into a transfer buffer since psiMinv and dpsiM_row_list are not multiwalker data
//i.e. each engine has its own psiMinv which is an OffloadMatrix instead of the leader having the data for all the walkers in the crowd.
//i.e. each engine has its own psiMinv which is an OffloadMatrix instead of the leader having the data for all the walkers in the crowd.
//Wouldn't have to do this if dpsiM and psiMinv were part of the mw_mem_ with data across all walkers in the crowd and could just use use_device_ptr for the offload.
//That is how mw_dspin is handled below
const int norb = engine_leader.get_psiMinv().rows();
Expand All @@ -241,7 +241,7 @@ class MatrixUpdateOMPTarget
//Note that mw_dspin should already be in sync between device and host...updateTo was called in
//SPOSet::mw_evaluateVGLWithSpin to sync
//Also note that since mw_dspin is Dual, I can just use mw_dspin.data() above and then use it directly inside
//then offload region. OMP will figure out the correct translation to the device address, i.e. no
//the offload region. OMP will figure out the correct translation to the device address, i.e. no
//need to include in the PRAGMA_OFFLOAD below
PRAGMA_OFFLOAD("omp target teams distribute num_teams(nw) \
map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \
Expand All @@ -253,7 +253,13 @@ class MatrixUpdateOMPTarget
const Value* __restrict__ dpsiM_row_ptr = reinterpret_cast<const Value**>(buffer_H2D_ptr)[nw + iw];
Value grad_x(0), grad_y(0), grad_z(0);
Complex spingrad(0);
#if defined(QMC_COMPLEX)
// This was causing an llvm-link error in icpx due to the lack of declare reduction on complex datatypes.
// Keep real builds free of any reduction on a complex datatype. Just serialize the reduction.
// Because mw_evalGradWithSpin is only being called in complex builds in simulations, the impact of this workaround is basically zero.
// It is still beneficial to keep it functional in real builds.
PRAGMA_OFFLOAD("omp parallel for reduction(+: grad_x, grad_y, grad_z, spingrad)")
#endif
for (int iorb = 0; iorb < norb; iorb++)
{
grad_x += invRow_ptr[iorb] * dpsiM_row_ptr[iorb * DIM];
Expand Down

0 comments on commit 0e75f5c

Please sign in to comment.