GPU CUDA delayed updates #1279

Merged: 33 commits from delayed_updates into develop, merged Feb 7, 2019.

Changes from all commits (33 commits)
7ca8750
First implementation of delayed updates into VMC_CUDA. Both w/ and w/…
Dec 22, 2017
85af24f
Compile fix for GCC on SummitDev/Summit and optimization of calc_lemm…
Apr 12, 2018
0596d39
Added output in the case a user chooses a negative k for the delayed …
Apr 12, 2018
ba41772
Replaced second smw_update call with small addon to update_onemove ke…
Apr 17, 2018
145bee1
Merge remote-tracking branch 'upstream/develop' into delayed_updates
Dec 10, 2018
b5cf582
Minor bugfixes caused by incomplete merge with new/renamed files (AoS…
Dec 12, 2018
9349e94
Added DMC functionality.
Dec 14, 2018
d6368ec
Slight performance update for GPU delayed updates code.
Dec 20, 2018
20892cc
Fix the compilation of GPU real.
ye-luo Dec 21, 2018
43a6560
Fixed hamiltonian unit test failure.
Jan 2, 2019
245438b
First implementation of complex GPU delayed updates code path.
Jan 3, 2019
1757bf2
Bugfix for complex code path. Things are working now.
Jan 3, 2019
0cb4d5f
Fix compile issues for slightly older CUDA version (i.e. on Rhea).
Jan 4, 2019
d5e984d
Minor compile fix.
Jan 4, 2019
e82c929
Moved delayed update routines into QMCWaveFunctions/Fermion/delayed_u…
Jan 7, 2019
d59cec9
Implemented pass-through of Ye's delay_rank parameter to my code path.
Jan 9, 2019
b39bd00
Compile fix.
Jan 9, 2019
ab6f5e3
Minor bugfix.
Jan 10, 2019
a9ff651
Revert to original cmake files and fix minor bug in DiracDeterminantC…
Jan 15, 2019
f849398
Merge branch 'develop' of https://github.com/QMCPACK/qmcpack into del…
Jan 15, 2019
aefab61
Fix merge issue which caused tests to fail.
Jan 15, 2019
29c9a0c
Fix drift case when delay rank not an integer fraction of electron nu…
Jan 18, 2019
3cbd58a
Code should be feature complete now as the last TODO (nat/2 use in MC…
Jan 18, 2019
17b7bdc
Removed resolved TODO comments.
Jan 18, 2019
c68fe6a
Cleaned up code and fixed bug of sometimes not properly initialized l…
Jan 24, 2019
5f279c4
Reverted changed files in utils directory back to original versions.
Jan 24, 2019
a5c369a
Fix complex compile bug.
Jan 24, 2019
49b908b
Added pivot table to cublas inverse to improve delayed update stabili…
Jan 31, 2019
13cf03e
Fixed numerical instability and improved performance for VMC w/o drift.
Feb 4, 2019
9f21534
Added missing synchronization before lemma matrix calculation.
Feb 5, 2019
004c09f
Added warning to cap GPU delayed updates at k = 64 for VMC w/ drift d…
Feb 5, 2019
7bda6a2
Fixed VMC w/o drift error for larger delay ranks. Removed cap and war…
Feb 6, 2019
55d6e47
Merge branch 'develop' into delayed_updates
ye-luo Feb 7, 2019
39 changes: 39 additions & 0 deletions src/CUDA/gpu_vector.h
@@ -360,6 +360,45 @@ class device_vector
#endif
}

  void
  asyncCopy(const T* vec_ptr, size_t len, size_t offset, size_t datalen)
  {
    if ((this->size() != len) || (this->size() < offset+datalen))
    {
      if (!own_data)
      {
        fprintf (stderr, "Assigning referenced GPU vector, but it has the "
                 "wrong size.\n");
        fprintf (stderr, "Name = %s.  This size = %ld, vec size = %ld\n",
                 name.c_str(), size(), len);
        abort();
      }
      if (len < offset+datalen)
      {
        fprintf (stderr, "Trying to write more than the length of the vector.\n");
        fprintf (stderr, "Name = %s.  This size = %ld, vec size = %ld, needed length = %ld\n",
                 name.c_str(), size(), len, offset+datalen);
        abort();
      }
      resize(len);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync (&((*this)[offset]), vec_ptr, datalen*sizeof(T),
                     cudaMemcpyHostToDevice, kernelStream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf (stderr, "In asyncCopy, name=%s, size=%ld vec.size()=%ld\n",
               name.c_str(), size(), len);
      fprintf (stderr, "this pointer = %p  vec pointer=%p\n",
               data_pointer, vec_ptr);
      fprintf (stderr, "CUDA error in device_vector::asyncCopy(const T* vec_ptr, len, offset, datalen) for %s:\n  %s\n",
               name.c_str(), cudaGetErrorString(err));
      abort();
    }
#endif
  }

  void
  asyncCopy(const std::vector<T, std::allocator<T> > &vec)
  { /* existing overload; unchanged body collapsed in the diff view */
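For orientation, here is a minimal usage sketch of the new offset-based overload, mirroring the call that proposeMove_GPU makes below (the names nw, kcurr, kblocksize, Rnew_host, and Rnew_GPU follow MCWalkerConfiguration; the surrounding context is a sketch, not the PR's code):

```cpp
// Sketch: enqueue a copy of one slice of nw newly proposed positions into
// the slot for delay step kcurr, leaving the other kblocksize-1 slices alone.
int nw = newPos.size();      // positions proposed this move (one per walker)
int offset = kcurr * nw;     // start of the current k-slice
// len = nw*kblocksize asserts the full (already resized) device length;
// only datalen = nw elements starting at `offset` are transferred.
Rnew_GPU.asyncCopy(&(Rnew_host[offset]), nw * kblocksize, offset, nw);
```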
2 changes: 0 additions & 2 deletions src/Numerics/CUDA/cuda_inverse.cu
@@ -144,7 +144,6 @@ convert_complex (Tdest **dest_list, Tsrc **src_list, int len)
}
}


// Four matrix inversion functions
// 1. for float matrices
// useHigherPrecision = false --> single precision operations
@@ -246,7 +245,6 @@
cudaDeviceSynchronize();
}


// 3. for complex float matrices
// useHigherPrecision = false --> single precision operations
// useHigherPrecision = true --> double precision operations (default)
1 change: 0 additions & 1 deletion src/Numerics/CUDA/cuda_inverse.h
@@ -40,7 +40,6 @@ cublas_inverse (cublasHandle_t handle,
int N, int rowStride, int numMats,
bool useHigherPrecision = true);


//////////////////////////////////////
// Complex single / mixed precision //
//////////////////////////////////////
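Commit 49b908b above adds a pivot table to the batched cuBLAS inversion for stability. As background, here is a minimal self-contained sketch of pivoted batched inversion using the stock cuBLAS API (generic cuBLAS usage in assumed double precision; this is not the PR's cublas_inverse implementation):

```cpp
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Invert `numMats` N x N double matrices already resident on the GPU.
// getrf computes pivoted LU factorizations in place; getri then forms the
// explicit inverses into a second set of matrices. The pivot array is the
// piece that partial pivoting contributes for numerical stability.
void batched_inverse(cublasHandle_t handle,
                     double** Alist_d,    // device array of device pointers (LU in/out)
                     double** Ainvlist_d, // device array of device pointers (inverses out)
                     int N, int numMats)
{
  int *pivots_d, *infos_d;
  cudaMalloc(&pivots_d, sizeof(int) * N * numMats);
  cudaMalloc(&infos_d,  sizeof(int) * numMats);

  // LU factorization with partial pivoting, one small matrix per batch entry.
  cublasDgetrfBatched(handle, N, Alist_d, N, pivots_d, infos_d, numMats);
  // Back-substitution using the recorded pivots to form the inverses.
  cublasDgetriBatched(handle, N, (const double**)Alist_d, N, pivots_d,
                      Ainvlist_d, N, infos_d, numMats);

  cudaDeviceSynchronize();
  cudaFree(pivots_d);
  cudaFree(infos_d);
}
```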
54 changes: 40 additions & 14 deletions src/Particle/MCWalkerConfiguration.cpp
@@ -555,14 +555,14 @@ void MCWalkerConfiguration::updateLists_GPU()
{
  int nw = WalkerList.size();
  int NumSpecies = getSpeciesSet().TotalNum;
  if (Rnew_GPU.size() != nw*kblocksize)
  {
    Rnew_GPU.resize(nw*kblocksize);
    RhokLists_GPU.resize(NumSpecies);
    for (int isp=0; isp<NumSpecies; isp++)
      RhokLists_GPU[isp].resize(nw);
    Rnew_host.resize(nw*kblocksize);
    Rnew.resize(nw*kblocksize);
    AcceptList_GPU.resize(nw);
    AcceptList_host.resize(nw);
    RList_GPU.resize(nw);
@@ -656,18 +656,44 @@ void MCWalkerConfiguration::copyWalkerGradToGPU()
void MCWalkerConfiguration::proposeMove_GPU
(std::vector<PosType> &newPos, int iat)
{
  int nw = newPos.size();
  if (Rnew_host.size() < nw*kblocksize)
  {
    Rnew.resize(nw*kblocksize);
    Rnew_host.resize(nw*kblocksize);
  }
  // store moves sequentially in k to make evaluation more straightforward:
  //         k=0     k=1         k=kblocksize-1
  // Rnew = [0,..,nw|0,..,nw|...|0,..,nw]
  int offset = kcurr*nw;
  for (int i=0; i<nw; i++)
    for (int dim=0; dim<OHMMS_DIM; dim++)
    {
      Rnew[i+offset][dim]      = newPos[i][dim];
      Rnew_host[i+offset][dim] = newPos[i][dim];
    }
  if (kDelay)
  {
    kcurr = (kcurr+1)%kblocksize; // loop kcurr around every kblocksize moves
    kstart = kblock*kblocksize;
    if (kcurr == 0)
      kblock++; // keep increasing kblock (even beyond the available matrix blocks) - the update check takes care of self-consistency
    // only copy the new position matrix when needed (i.e. when an update is imminent)
    if (klinear)
      Rnew_GPU.asyncCopy(&(Rnew_host[offset]), nw*kblocksize, offset, nw);
    else if (kcurr == 0 || (kcurr + kblock*kblocksize >= getnat(iat)))
      Rnew_GPU.asyncCopy(Rnew_host);
  }
  else
    Rnew_GPU.asyncCopy(Rnew_host);
  CurrentParticle = iat;
}


void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept, int k)
{
  if (AcceptList_host.size() < toAccept.size())
    AcceptList_host.resize(toAccept.size());
@@ -682,13 +708,13 @@ void MCWalkerConfiguration::acceptMove_GPU(std::vector<bool> &toAccept, int k)
  // app_log() << "RList_GPU.size() = " << RList_GPU.size() << std::endl;
  if (RList_GPU.size() != WalkerList.size())
    std::cerr << "Error in RList_GPU size.\n";
  if (Rnew_GPU.size() != WalkerList.size()*kblocksize)
    std::cerr << "Error in Rnew_GPU size.\n";
  if (AcceptList_GPU.size() != WalkerList.size())
    std::cerr << "Error in AcceptList_GPU size.\n";
  accept_move_GPU_cuda
  (RList_GPU.data(), (CUDA_PRECISION*)Rnew_GPU.data(),
   AcceptList_GPU.data(), CurrentParticle++, WalkerList.size(), k);
}


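To show how the new k argument ties the two calls together, here is a hypothetical driver loop for one k-block (makeProposals, toAccept, and the exact call ordering are assumptions for illustration; the real drivers are the VMC/DMC updater classes):

```cpp
// Sketch: each proposal lands in slice kcurr of Rnew; the matching accept
// passes the same slice index so accept_kernel reads the right positions.
int nw = W.WalkerList.size();          // hypothetical walker-count access
std::vector<bool> toAccept(nw, false);
for (int k = 0; k < W.getkblocksize(); k++)
{
  std::vector<PosType> newPos = makeProposals(W, iat + k); // hypothetical helper
  W.proposeMove_GPU(newPos, iat + k); // writes slice kcurr == k, advances kcurr
  // ... compute acceptance ratios and fill toAccept per walker ...
  W.acceptMove_GPU(toAccept, k);      // consumes slice k on the GPU
}
```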
121 changes: 118 additions & 3 deletions src/Particle/MCWalkerConfiguration.h
@@ -123,10 +123,13 @@ class MCWalkerConfiguration: public ParticleSet
  int CurrentParticle;
  GPU_XRAY_TRACE void proposeMove_GPU
  (std::vector<PosType> &newPos, int iat);
  GPU_XRAY_TRACE void acceptMove_GPU
  (std::vector<bool> &toAccept, int k);
  GPU_XRAY_TRACE void acceptMove_GPU
  (std::vector<bool> &toAccept) { acceptMove_GPU(toAccept,0); }
  GPU_XRAY_TRACE void NLMove_GPU (std::vector<Walker_t*> &walkers,
                                  std::vector<PosType> &Rnew,
                                  std::vector<int> &iat);
#endif

///default constructor
@@ -379,6 +382,102 @@
}
}

#ifdef QMC_CUDA
  inline void setklinear()
  {
    klinear = true;
  }

  inline bool getklinear()
  {
    return klinear;
  }

  inline void setkDelay(int k)
  {
    klinear = false;
    kDelay = k;
    if (kDelay < 0)
    {
      app_log() << "  Warning: Specified negative delayed updates k = " << k << ", setting to zero (no delay)." << std::endl;
      kDelay = 0;
    }
    if (kDelay == 1)
      kDelay = 0; // use the old algorithm; the extra overhead of k=1 is not useful outside of code development
    kblocksize = 1;
    kblock = 0;
    kcurr = 0;
    kstart = 0;
    if (kDelay)
    {
      app_log() << "  Using delayed updates (k = " << kDelay << ") for all walkers" << std::endl;
      kblocksize = kDelay;
    }
    kupdate = kblocksize;
  }

  inline int getkDelay()
  {
    return kDelay;
  }

  inline int getkblock()
  {
    return kblock;
  }

  inline int getkblocksize()
  {
    return kblocksize;
  }

  inline int getkcurr()
  {
    return kcurr;
  }

  inline int getkstart()
  {
    return kstart;
  }

  inline int getkupdate()
  {
    return kupdate;
  }

  inline int getnat(int iat)
  {
    for (unsigned int gid=0; gid<groups(); gid++)
      if (last(gid) > iat)
        return last(gid) - first(gid);
    return 0; // not reached for a valid iat; avoids a missing-return warning
  }

  inline bool update_now(int iat)
  {
    // An update needs to happen now either when the current k-block is finished (kcurr==0) -
    // this case also covers runs without delayed updates, where kcurr is always zero -
    // or when we run out of electrons (nat) while k's remain in the current k-block.
    bool end_of_matrix = (kcurr + kblock*kblocksize >= getnat(iat));
    bool update = ((!kcurr) || end_of_matrix);
    kupdate = kblocksize;
    if (update)
    {
      if (kblock > 0)
      {
        kstart = kblock*kblocksize;
        if (kcurr == 0)
          kstart -= kblocksize; // we looped cleanly within the kblocksize matrix (kblock is too large by 1), hence start at (kblock-1)*kblocksize
        kupdate = kcurr + kblock*kblocksize - kstart;
        kcurr = 0;
        if (!klinear)
          CurrentParticle -= kupdate - 1;
      }
    }
    // reset kblock if we're out of matrix blocks
    if (end_of_matrix)
      kblock = 0;
    return update;
  }
#endif

protected:

///boolean for cleanup
@@ -391,6 +490,22 @@
  int GlobalNumWalkers;
  ///update-mode index
  int UpdateMode;
#ifdef QMC_CUDA
  ///delayed update streak parameter k
  int kDelay;
  ///block dimension (usually k) when delayed updates are used (there are nat/kblocksize blocks available)
  int kblocksize = 1;
  ///current block
  int kblock;
  ///current k within the current block
  int kcurr;
  ///k to start the update from
  int kstart;
  ///number of columns to update
  int kupdate;
  ///klinear switch indicating whether values are calculated sequentially for algorithms using drift
  bool klinear;
#endif

  RealType LocalEnergy;

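To see the bookkeeping above in action, here is a small standalone replay of the kcurr/kblock counters for an assumed species of nat = 10 electrons at delay rank k = 4 (illustrative only; it inlines the logic of proposeMove_GPU and update_now):

```cpp
#include <cstdio>

// Standalone replay of the delayed-update counters (assumed nat=10, k=4).
// Expected output: updates fire after electrons 4, 8, and 10, applying
// kupdate = 4, 4, and 2 columns starting at kstart = 0, 4, and 8.
int main()
{
  const int nat = 10, kblocksize = 4;
  int kblock = 0, kcurr = 0, kstart = 0, kupdate = kblocksize;

  for (int iat = 0; iat < nat; iat++)
  {
    // proposeMove_GPU: advance kcurr, bump kblock at the end of each block
    kcurr = (kcurr + 1) % kblocksize;
    kstart = kblock * kblocksize;
    if (kcurr == 0)
      kblock++;

    // update_now: decide whether the delayed update must be applied now
    bool end_of_matrix = (kcurr + kblock * kblocksize >= nat);
    bool update = (!kcurr) || end_of_matrix;
    kupdate = kblocksize;
    if (update && kblock > 0)
    {
      kstart = kblock * kblocksize;
      if (kcurr == 0)
        kstart -= kblocksize;
      kupdate = kcurr + kblock * kblocksize - kstart;
      kcurr = 0;
    }
    if (end_of_matrix)
      kblock = 0;

    if (update)
      printf("after electron %2d: update %d columns from kstart=%d\n",
             iat + 1, kupdate, kstart);
  }
  return 0;
}
```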
5 changes: 5 additions & 0 deletions src/Particle/ParticleSet.h
@@ -624,6 +624,11 @@ class ParticleSet
  {
    return SubPtcl[igroup+1];
  }

  inline int groupsize(int igroup) const
  {
    return SubPtcl[igroup+1] - SubPtcl[igroup];
  }

///add attributes to list for IO
template<typename ATList>
12 changes: 6 additions & 6 deletions src/Particle/accept_kernel.cu
@@ -17,7 +17,7 @@

template<typename T, int BS>
__global__ void accept_kernel (T** Rlist, T* Rnew,
                               int* toAccept, int iat, int N, int k)
{
  int tid = threadIdx.x;
  __shared__ T* myR[BS];
@@ -26,7 +26,7 @@ __global__ void accept_kernel (T** Rlist, T* Rnew,
  int block = blockIdx.x;
  for (int i=0; i<3; i++)
    if ((3*block+i)*BS + tid < 3*N)
      Rnew_shared[i*BS + tid] = Rnew[(3*block+i)*BS + tid + 3*k*N];
  __syncthreads();
  if (block*BS + tid < N)
  {
@@ -55,27 +55,27 @@

void
accept_move_GPU_cuda (float** Rlist, float new_pos[],
                      int toAccept[], int iat, int N, int k)
{
  const int BS=32;
  int NB = N / BS + ((N % BS) ? 1 : 0);
  dim3 dimBlock(BS);
  dim3 dimGrid(NB);
  accept_kernel<float,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
  (Rlist, new_pos, toAccept, iat, N, k);
}


void
accept_move_GPU_cuda (double** Rlist, double* new_pos,
                      int* toAccept, int iat, int N, int k)
{
  const int BS=32;
  int NB = N / BS + ((N % BS) ? 1 : 0);
  dim3 dimBlock(BS);
  dim3 dimGrid(NB);
  accept_kernel<double,BS><<<dimGrid,dimBlock, 0, gpu::kernelStream>>>
  (Rlist, new_pos, toAccept, iat, N, k);
}


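For reference, the slice addressing used by accept_kernel, rewritten as a plain CPU loop (pos is a hypothetical output array; Rnew is the flattened [kblocksize][N][3] buffer filled by proposeMove_GPU):

```cpp
// Equivalent of what slice k of accept_kernel reads: walker w's coordinates
// are stored AoS at 3*w within the slice, and slices are 3*N apart.
for (int w = 0; w < N; w++)     // walker index
  for (int d = 0; d < 3; d++)   // x, y, z
    pos[w][d] = Rnew[3*k*N + 3*w + d];
```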
4 changes: 2 additions & 2 deletions src/Particle/accept_kernel.h
@@ -19,10 +19,10 @@

void
accept_move_GPU_cuda (float* Rlist[], float new_pos[],
                      int toAccept[], int iat, int N, int k);
void
accept_move_GPU_cuda (double* Rlist[], double new_pos[],
                      int toAccept[], int iat, int N, int k);

void NL_move_cuda ( float* Rlist[], float new_pos[], int N);
void NL_move_cuda (double* Rlist[], double new_pos[], int N);