change CUDA implementation of randperm OP
zhwesky2010 committed Mar 14, 2022
1 parent 329b095 commit 7be4b95
Showing 2 changed files with 219 additions and 18 deletions.
160 changes: 142 additions & 18 deletions paddle/phi/kernels/gpu/randperm_kernel.cu
@@ -14,37 +14,161 @@

#include "paddle/phi/kernels/randperm_kernel.h"

#ifdef __NVCC__
#include <curand_kernel.h>
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hiprand_kernel.h>
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/randint_kernel.h"

// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"

DECLARE_bool(use_curand);

namespace phi {

template <typename T>
__global__ void SwapRepeatKernel(
    int* key, T* data, int n, uint64_t seed, uint64_t offset) {
  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
  if (idx >= static_cast<size_t>(n - 1)) return;

  // Only the first element of a run of equal sort keys repairs that run.
  bool first_repeat = false;
  if (key[idx] == key[idx + 1]) {
    if (idx == 0) {
      first_repeat = true;
    } else if (key[idx] != key[idx - 1]) {
      first_repeat = true;
    }
  }

  if (!first_repeat) return;

  // Measure the length of the run of equal keys starting at idx.
  int repeat_size = 1;
  for (int i = idx; i < n - 1; ++i) {
    if (key[i] == key[i + 1]) {
      ++repeat_size;
    } else {
      break;
    }
  }

  // Fisher-Yates shuffle of the data elements sharing this key, so that
  // ties in the radix sort are broken uniformly at random.
#ifdef __NVCC__
  curandStatePhilox4_32_10_t state;
  curand_init(seed, idx, offset, &state);
  for (int i = repeat_size - 1; i > 0; i--) {
    uint32_t r = curand(&state) % (i + 1);
#elif __HIPCC__
  hiprandStatePhilox4_32_10_t state;
  hiprand_init(seed, idx, offset, &state);
  for (int i = repeat_size - 1; i > 0; i--) {
    uint32_t r = hiprand(&state) % (i + 1);
#endif
    if (r != i) {
      T tmp = data[idx + i];
      data[idx + i] = data[idx + r];
      data[idx + r] = tmp;
    }
  }
}
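
Note: SwapRepeatKernel is the tie-breaking step of a sort-based shuffle. A
minimal NumPy sketch of the same idea, for illustration only (the helper name
randperm_by_key_sort and the use of NumPy are assumptions of this sketch, not
part of the commit): draw one random key per position, argsort, then
Fisher-Yates-shuffle every run of equal keys.

import numpy as np

def randperm_by_key_sort(n, rng):
    # One random sort key per output position (the RandintKernel step).
    keys = rng.integers(np.iinfo(np.int32).min, np.iinfo(np.int32).max,
                        size=n, dtype=np.int64)
    perm = np.argsort(keys, kind='stable')  # sorts pairs (key, 0..n-1)
    sorted_keys = keys[perm]
    # The SwapRepeatKernel step: shuffle each run of equal keys so that
    # ties do not bias the result toward the stable sort order.
    i = 0
    while i < n:
        j = i
        while j + 1 < n and sorted_keys[j + 1] == sorted_keys[i]:
            j += 1
        if j > i:
            rng.shuffle(perm[i:j + 1])  # Fisher-Yates within the run
        i = j + 1
    return perm

perm = randperm_by_key_sort(10, np.random.default_rng(2021))
assert sorted(perm.tolist()) == list(range(10))
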

template <typename T, typename Context>
void RandpermRawKernel(
    const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) {
  if (FLAGS_use_curand) {
    // Generate a random int32 key for every output position.
    DenseTensor key;
    RandintKernel<int, Context>(dev_ctx,
                                std::numeric_limits<int>::min(),
                                std::numeric_limits<int>::max(),
                                ScalarArray({n}),
                                phi::DataType::INT32,
                                &key);
    DenseTensor key_out = Empty<int, Context>(dev_ctx, ScalarArray({n}));

    // Fill range with 0..n-1; sorting it by the random keys yields the
    // permutation.
    DenseTensor range = Empty<T, Context>(dev_ctx, ScalarArray({n}));
    T* range_data = range.data<T>();
    funcs::ForRange<Context> for_range(dev_ctx, n);
    for_range([range_data] __device__(size_t idx) {
      range_data[idx] = static_cast<T>(idx);
    });

    out->Resize(phi::make_ddim({n}));
    T* out_data = dev_ctx.template Alloc<T>(out);

    // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to
    // improve performance of radix sort.
    double n_d = static_cast<double>(n);
    int begin_bit = 0;
    int end_bit =
        std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9))));

    // First call only queries the temporary storage size; second call sorts.
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs<int, T>(nullptr,
                                            temp_storage_bytes,
                                            key.data<int>(),
                                            key_out.data<int>(),
                                            range.data<T>(),
                                            out_data,
                                            n,
                                            begin_bit,
                                            end_bit < 32 ? end_bit : 32,
                                            dev_ctx.stream());

    auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes);
    cub::DeviceRadixSort::SortPairs<int, T>(d_temp_storage->ptr(),
                                            temp_storage_bytes,
                                            key.data<int>(),
                                            key_out.data<int>(),
                                            range.data<T>(),
                                            out_data,
                                            n,
                                            begin_bit,
                                            end_bit < 32 ? end_bit : 32,
                                            dev_ctx.stream());

    auto gen_cuda = dev_ctx.GetGenerator();
    auto seed_offset = gen_cuda->IncrementOffset(n);
    uint64_t seed = seed_offset.first;
    uint64_t offset = seed_offset.second;

    // Break ties between equal keys so every permutation stays equally
    // likely.
    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n);
    SwapRepeatKernel<T><<<config.block_per_grid.x,
                          config.thread_per_block.x,
                          0,
                          dev_ctx.stream()>>>(
        key_out.data<int>(), out_data, n, seed, offset);
  } else {
    // Fallback path: shuffle 0..n-1 on the host, then copy to the device.
    DenseTensor tmp;
    tmp.Resize(phi::make_ddim({n}));
    T* tmp_data = dev_ctx.template HostAlloc<T>(&tmp);

    std::shared_ptr<std::mt19937_64> engine;
    if (seed) {
      engine = std::make_shared<std::mt19937_64>();
      engine->seed(seed);
    } else {
      engine = dev_ctx.GetHostGenerator()->GetCPUEngine();
    }

    for (int i = 0; i < n; ++i) {
      tmp_data[i] = static_cast<T>(i);
    }
    std::shuffle(tmp_data, tmp_data + n, *engine);

    T* out_data = dev_ctx.template Alloc<T>(out);
    auto size = out->numel() * paddle::experimental::SizeOf(out->dtype());
    paddle::memory::Copy<phi::GPUPlace, phi::Place>(
        out->place(), out_data, tmp.place(), tmp_data, size, 0);
  }
}
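
Note: the end_bit expression bounds how many low key bits the radix sort
actually has to process. By the birthday bound, n keys drawn uniformly from a
domain of size D are all distinct with probability about exp(-n*(n-1)/(2*D)),
so holding that probability near 0.9 needs D of roughly n^2 / (2*ln(1/0.9)),
which is what n_d - (6*n_d*n_d + 1) / (12*log(0.9)) computes up to small
correction terms. A plain-Python sanity check of the bound (only the 0.9
target comes from the code; the rest is standard math):

import math

def end_bit(n):
    # Mirrors the C++ expression above; math.log, like std::log, is ln.
    n_d = float(n)
    return math.ceil(
        math.log2(n_d - (6 * n_d * n_d + 1) / (12 * math.log(0.9))))

for n in (100, 10000, 30000):
    bits = min(end_bit(n), 32)  # the kernel clamps end_bit to 32
    domain = 2.0 ** bits
    p_all_distinct = math.exp(-n * (n - 1) / (2 * domain))
    print(n, bits, round(p_all_distinct, 3))  # roughly 0.9 or above
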

template <typename T, typename Context>
77 changes: 77 additions & 0 deletions python/paddle/fluid/tests/unittests/test_randperm_op.py
@@ -18,6 +18,7 @@
import paddle
import paddle.fluid.core as core
from paddle.static import program_guard, Program
import os


def check_randperm_out(n, data_np):
@@ -129,5 +130,81 @@ def test_out(self):
        paddle.enable_static()


class TestRandomValue(unittest.TestCase):
    def test_fixed_random_number(self):
        # Test fixed GPU random numbers generated by 'curandStatePhilox4_32_10_t'.
        if not paddle.is_compiled_with_cuda():
            return

        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
            return

        print("Test Fixed Random number on GPU------>")
        paddle.disable_static()
        paddle.set_device('gpu')
        paddle.seed(2021)

        x = paddle.randperm(30000, dtype='int32').numpy()
        expect = [
            24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413
        ]
        self.assertTrue(np.array_equal(x[0:10], expect))
        expect = [
            29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096
        ]
        self.assertTrue(np.array_equal(x[10000:10010], expect))
        expect = [
            298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343
        ]
        self.assertTrue(np.array_equal(x[20000:20010], expect))

        x = paddle.randperm(30000, dtype='int64').numpy()
        expect = [
            6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171
        ]
        self.assertTrue(np.array_equal(x[0:10], expect))
        expect = [
            23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818
        ]
        self.assertTrue(np.array_equal(x[10000:10010], expect))
        expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839]
        self.assertTrue(np.array_equal(x[20000:20010], expect))

        x = paddle.randperm(30000, dtype='float32').numpy()
        expect = [
            5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144.,
            22906., 10705.
        ]
        self.assertTrue(np.array_equal(x[0:10], expect))
        expect = [
            1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564.,
            26991.
        ]
        self.assertTrue(np.array_equal(x[10000:10010], expect))
        expect = [
            25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339.,
            4662.
        ]
        self.assertTrue(np.array_equal(x[20000:20010], expect))

        x = paddle.randperm(30000, dtype='float64').numpy()
        expect = [
            19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147.,
            9163.
        ]
        self.assertTrue(np.array_equal(x[0:10], expect))
        expect = [
            15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945.,
            17624.
        ]
        self.assertTrue(np.array_equal(x[10000:10010], expect))
        expect = [
            10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202.,
            21404.
        ]
        self.assertTrue(np.array_equal(x[20000:20010], expect))
        paddle.enable_static()


if __name__ == "__main__":
    unittest.main()
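
Note: the reproducibility contract the test above relies on can be exercised
directly. A minimal usage sketch, assuming a CUDA build of Paddle (setting
FLAGS_use_curand through the environment mirrors the os.getenv check in the
test and is an assumption of this sketch, not something the commit documents):

import os
os.environ['FLAGS_use_curand'] = '1'  # assumed toggle for the cuRAND path

import numpy as np
import paddle

paddle.set_device('gpu')
paddle.seed(2021)
a = paddle.randperm(30000, dtype='int32').numpy()
paddle.seed(2021)
b = paddle.randperm(30000, dtype='int32').numpy()
# Same seed, same device -> the Philox-based kernel returns the same
# permutation both times.
assert np.array_equal(a, b)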
