From 0efec911129404db604f06f87d2248a2dbc411b8 Mon Sep 17 00:00:00 2001 From: Peter Eastman Date: Fri, 2 Oct 2020 10:41:05 -0700 Subject: [PATCH] Optimizations to CudaANISymmetryFunctions --- ani/CudaANISymmetryFunctions.cu | 82 +++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/ani/CudaANISymmetryFunctions.cu b/ani/CudaANISymmetryFunctions.cu index 26eb92b..d62f265 100644 --- a/ani/CudaANISymmetryFunctions.cu +++ b/ani/CudaANISymmetryFunctions.cu @@ -233,23 +233,20 @@ template __global__ void computeAngularFunctions(int numAtoms, int numSpecies, int numAngular, float angularCutoff, float* angular, int* neighbors, int* neighborCount, const float3* positions, const float* periodicBoxVectors, const AngularFunction* angularFunctions, const int* atomSpecies, const int* angularIndex) { - const int warp = (blockIdx.x*blockDim.x+threadIdx.x)/32; - const int indexInWarp = threadIdx.x%32; - const int numWarps = (gridDim.x*blockDim.x)/32; const float3 invBoxSize = (PERIODIC ? make_float3(1/periodicBoxVectors[0], 1/periodicBoxVectors[4], 1/periodicBoxVectors[8]) : make_float3(0, 0, 0)); const int c1 = numAngular; const int c2 = numSpecies*(numSpecies+1)*c1/2; - // Each warp loops over atoms. + // Each thread block loops over atoms. - for (int atom1 = warp; atom1 < numAtoms; atom1 += numWarps) { + for (int atom1 = blockIdx.x; atom1 < numAtoms; atom1 += gridDim.x) { float3 pos1 = positions[atom1]; int numNeighbors = neighborCount[atom1]; - // The threads in the warp loop over pairs of atoms from the neighbor list. + // The threads in the block loop over pairs of atoms from the neighbor list. int numPairs = numNeighbors*(numNeighbors-1)/2; - for (int pair = indexInWarp; pair < numPairs; pair += 32) { + for (int pair = threadIdx.x; pair < numPairs; pair += blockDim.x) { int i = (int) floorf(numNeighbors-0.5f-sqrtf((numNeighbors-0.5f)*(numNeighbors-0.5f)-2*pair)); int j = pair - i*numNeighbors + (i+1)*(i+2)/2; int atom2 = neighbors[atom1*numAtoms + i]; @@ -300,8 +297,22 @@ void CudaANISymmetryFunctions::computeSymmetryFunctions(const float* positions, // Record the positions and periodic box vectors. CHECK_RESULT(cudaMemcpyAsync(this->positions, positions, 3*numAtoms*sizeof(float), cudaMemcpyDefault)); - if (periodic) - CHECK_RESULT(cudaMemcpyAsync(this->periodicBoxVectors, periodicBoxVectors, 9*sizeof(float), cudaMemcpyDefault)); + float* hostBoxVectors; + if (periodic) { + // We'll need to access the box vectors on both host and device. Figure out the most + // efficient way of doing that. + + cudaPointerAttributes attrib; + cudaError_t result = cudaPointerGetAttributes(&attrib, periodicBoxVectors); + if (result != cudaSuccess || attrib.hostPointer == 0) { + CHECK_RESULT(cudaMemcpy(this->periodicBoxVectors, periodicBoxVectors, 9*sizeof(float), cudaMemcpyDefault)); + hostBoxVectors = this->periodicBoxVectors; + } + else { + CHECK_RESULT(cudaMemcpyAsync(this->periodicBoxVectors, periodicBoxVectors, 9*sizeof(float), cudaMemcpyDefault)); + hostBoxVectors = (float*) attrib.hostPointer; + } + } // Determine whether we have a rectangular or triclinic periodic box. @@ -309,7 +320,7 @@ void CudaANISymmetryFunctions::computeSymmetryFunctions(const float* positions, if (periodic) for (int i = 0 ; i < 3; i++) for (int j = 0; j < 3; j++) - if (i != j && periodicBoxVectors[3*i+j] != 0) + if (i != j && hostBoxVectors[3*i+j] != 0) triclinic = true; // Clear the output arrays. @@ -320,7 +331,7 @@ void CudaANISymmetryFunctions::computeSymmetryFunctions(const float* positions, // Compute the symmetry functions. int blockSize = 128; - int numBlocks = min(maxBlocks, (int) ceil(numAtoms/4.0)); + int numBlocks = min(maxBlocks, numAtoms); int numRadial = radialFunctions.size(); int numAngular = angularFunctions.size(); if (periodic) { @@ -362,24 +373,21 @@ __global__ void backpropRadialFunctions(int numAtoms, int numSpecies, int numRad float radialCutoff, const float* radialDeriv, float* positionDeriv, const float3* positions, const float* periodicBoxVectors, const RadialFunction* radialFunctions, const int* atomSpecies, bool torchani) { - const int warp = (blockIdx.x*blockDim.x+threadIdx.x)/32; - const int indexInWarp = threadIdx.x%32; - const int numWarps = (gridDim.x*blockDim.x)/32; const float3 invBoxSize = (PERIODIC ? make_float3(1/periodicBoxVectors[0], 1/periodicBoxVectors[4], 1/periodicBoxVectors[8]) : make_float3(0, 0, 0)); const int c1 = numRadial; const int c2 = numSpecies*c1; const float radialCutoff2 = radialCutoff*radialCutoff; const float globalScale = (torchani ? 0.25f : 1.0f); - // Each warp loops over atoms. + // Each thread block loops over atoms. - for (int atom1 = warp; atom1 < numAtoms; atom1 += numWarps) { + for (int atom1 = blockIdx.x; atom1 < numAtoms; atom1 += gridDim.x) { float3 pos1 = positions[atom1]; float3 posDeriv1 = make_float3(0, 0, 0); - // The threads in the warp loop over second atoms. + // The threads in the block loop over second atoms. - for (int atom2 = indexInWarp; atom2 < numAtoms; atom2 += 32) { + for (int atom2 = atom1+1+threadIdx.x; atom2 < numAtoms; atom2 += blockDim.x) { float3 pos2 = positions[atom2]; float3 delta; float r2; @@ -387,11 +395,12 @@ __global__ void backpropRadialFunctions(int numAtoms, int numSpecies, int numRad // Compute the derivatives of the symmetry functions. - if (r2 < radialCutoff2 && atom1 != atom2) { + if (r2 < radialCutoff2) { float r = sqrtf(r2); float rInv = 1/r; float cutoff = cutoffFunction(r, radialCutoff); float dCdR = cutoffDeriv(r, radialCutoff); + float3 posDeriv2 = make_float3(0, 0, 0); for (int i = 0; i < numRadial; i++) { const RadialFunction fn = radialFunctions[i]; float shifted = r-fn.rs; @@ -399,10 +408,16 @@ __global__ void backpropRadialFunctions(int numAtoms, int numSpecies, int numRad float dVdR = dCdR*expTerm - cutoff*2*fn.eta*shifted*expTerm; float dEdV = radialDeriv[atom1*c2 + atomSpecies[atom2]*c1 + i] + radialDeriv[atom2*c2 + atomSpecies[atom1]*c1 + i]; float scale = globalScale * dEdV * dVdR * rInv; - posDeriv1.x += scale*delta.x; - posDeriv1.y += scale*delta.y; - posDeriv1.z += scale*delta.z; + posDeriv1.x -= scale*delta.x; + posDeriv1.y -= scale*delta.y; + posDeriv1.z -= scale*delta.z; + posDeriv2.x += scale*delta.x; + posDeriv2.y += scale*delta.y; + posDeriv2.z += scale*delta.z; } + atomicAdd(&positionDeriv[3*atom2], posDeriv2.x); + atomicAdd(&positionDeriv[3*atom2+1], posDeriv2.y); + atomicAdd(&positionDeriv[3*atom2+2], posDeriv2.z); } } for (int offset = 16; offset > 0; offset /= 2) { @@ -410,10 +425,10 @@ __global__ void backpropRadialFunctions(int numAtoms, int numSpecies, int numRad posDeriv1.y += __shfl_down_sync(0xFFFFFFFF, posDeriv1.y, offset); posDeriv1.z += __shfl_down_sync(0xFFFFFFFF, posDeriv1.z, offset); } - if (indexInWarp == 0) { - positionDeriv[3*atom1] -= posDeriv1.x; - positionDeriv[3*atom1+1] -= posDeriv1.y; - positionDeriv[3*atom1+2] -= posDeriv1.z; + if (threadIdx.x%32 == 0) { + atomicAdd(&positionDeriv[3*atom1], posDeriv1.x); + atomicAdd(&positionDeriv[3*atom1+1], posDeriv1.y); + atomicAdd(&positionDeriv[3*atom1+2], posDeriv1.z); } } } @@ -423,24 +438,21 @@ __global__ void backpropAngularFunctions(int numAtoms, int numSpecies, int numAn const float* angularDeriv, float* positionDeriv, int* neighbors, int* neighborCount, const float3* positions, const float* periodicBoxVectors, const AngularFunction* angularFunctions, const int* atomSpecies, const int* angularIndex) { - const int warp = (blockIdx.x*blockDim.x+threadIdx.x)/32; - const int indexInWarp = threadIdx.x%32; - const int numWarps = (gridDim.x*blockDim.x)/32; const float3 invBoxSize = (PERIODIC ? make_float3(1/periodicBoxVectors[0], 1/periodicBoxVectors[4], 1/periodicBoxVectors[8]) : make_float3(0, 0, 0)); const int c1 = numAngular; const int c2 = numSpecies*(numSpecies+1)*c1/2; - // Each warp loops over atoms. + // Each thread block loops over atoms. - for (int atom1 = warp; atom1 < numAtoms; atom1 += numWarps) { + for (int atom1 = blockIdx.x; atom1 < numAtoms; atom1 += gridDim.x) { float3 pos1 = positions[atom1]; float3 posDeriv1 = make_float3(0, 0, 0); int numNeighbors = neighborCount[atom1]; - // The threads in the warp loop over pairs of atoms from the neighbor list. + // The threads in the block loop over pairs of atoms from the neighbor list. int numPairs = numNeighbors*(numNeighbors-1)/2; - for (int pair = indexInWarp; pair < numPairs; pair += 32) { + for (int pair = threadIdx.x; pair < numPairs; pair += blockDim.x) { int i = (int) floorf(numNeighbors-0.5f-sqrtf((numNeighbors-0.5f)*(numNeighbors-0.5f)-2*pair)); int j = pair - i*numNeighbors + (i+1)*(i+2)/2; int atom2 = neighbors[atom1*numAtoms + i]; @@ -538,7 +550,7 @@ __global__ void backpropAngularFunctions(int numAtoms, int numSpecies, int numAn posDeriv1.y += __shfl_down_sync(0xFFFFFFFF, posDeriv1.y, offset); posDeriv1.z += __shfl_down_sync(0xFFFFFFFF, posDeriv1.z, offset); } - if (indexInWarp == 0) { + if (threadIdx.x%32 == 0) { atomicAdd(&positionDeriv[3*atom1], posDeriv1.x); atomicAdd(&positionDeriv[3*atom1+1], posDeriv1.y); atomicAdd(&positionDeriv[3*atom1+2], posDeriv1.z); @@ -556,7 +568,7 @@ void CudaANISymmetryFunctions::backprop(const float* radialDeriv, const float* a // Backpropagate through the symmetry functions. int blockSize = 128; - int numBlocks = min(maxBlocks, (int) ceil(numAtoms/4.0)); + int numBlocks = min(maxBlocks, numAtoms); int numRadial = radialFunctions.size(); int numAngular = angularFunctions.size(); if (periodic) {