Skip to content

Commit

Permalink
remove some routine version inconsistencies
Browse files Browse the repository at this point in the history
  • Loading branch information
djm34 committed Jun 8, 2017
1 parent 877b7aa commit 171b862
Showing 1 changed file with 1 addition and 71 deletions.
72 changes: 1 addition & 71 deletions lyra2/cuda_lyra2Z_sm5.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ __device__ __forceinline__ void ST4S(const int index, const uint2 data)
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}

#if __CUDA_ARCH__ == 300

__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
return __shfl(a, b, c);
Expand All @@ -67,76 +67,6 @@ __device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, ui
a3 = WarpShuffle(a3, b3, c);
}

#else
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];

const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
uint32_t *_ptr = (uint32_t*)shared_mem;

__threadfence_block();
uint32_t buf = _ptr[thread];

_ptr[thread] = a;
__threadfence_block();
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];

__threadfence_block();
_ptr[thread] = buf;

__threadfence_block();
return result;
}

__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];

const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;

__threadfence_block();
uint2 buf = shared_mem[thread];

shared_mem[thread] = a;
__threadfence_block();
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];

__threadfence_block();
shared_mem[thread] = buf;

__threadfence_block();
return result;
}

__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
extern __shared__ uint2 shared_mem[];

const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;

__threadfence_block();
uint2 buf = shared_mem[thread];

shared_mem[thread] = a1;
__threadfence_block();
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
__threadfence_block();
shared_mem[thread] = a2;
__threadfence_block();
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
__threadfence_block();
shared_mem[thread] = a3;
__threadfence_block();
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];

__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
}

#endif



__device__ __forceinline__ void round_lyra(uint2 s[4])
Expand Down

0 comments on commit 171b862

Please sign in to comment.