From 5e95c378ea990dcebb67774678f06ce5686dd2f1 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Wed, 16 Feb 2022 03:08:17 +0000 Subject: [PATCH 1/8] add elementwise_div --- .../elementwise/elementwise_div_op.cu | 6 +++ .../device/gpu/cuda/cuda_device_function.h | 27 ++++++++++++ paddle/pten/kernels/gpu/math_kernel.cu | 2 + .../unittests/test_elementwise_div_op.py | 41 ++++++++++++++++++- 4 files changed, 75 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 06f9107db27b4..9eb4b0352e533 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -53,6 +53,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, @@ -65,6 +67,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, @@ -78,6 +82,8 @@ REGISTER_OP_CUDA_KERNEL( float>, ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel(delta), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, + bfloat16 val, int delta, + int width) { +#if defined(PADDLE_CUDA_BF16) + return bfloat16(__shfl_down_sync(mask, static_cast(val), + static_cast(delta), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( unsigned mask, paddle::platform::complex val, int delta, int width) { @@ -90,6 +105,18 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, return float16(__shfl_xor_sync(mask, val.to_half(), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, + bfloat16 val, + int width) { +#if defined(PADDLE_CUDA_BF16) + return bfloat16(__shfl_xor_sync(mask, static_cast(val), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( unsigned mask, paddle::platform::complex val, int width) { diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 1a549087e4221..95e6645902647 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -92,6 +92,7 @@ DEFINE_CUDA_ELEMENTWISE_OP(Divide) } // namespace pten using float16 = paddle::platform::float16; +using bfloat16 = paddle::platform::bfloat16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; @@ -126,6 +127,7 @@ PT_REGISTER_KERNEL(divide_raw, int, int64_t, float16, + bfloat16, complex64, complex128) {} PT_REGISTER_KERNEL(multiply_raw, diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 32860a6694a89..7e1e327cc07db 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -18,7 +18,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseDivOp(OpTest): @@ -55,6 +55,45 @@ def init_dtype(self): pass +@unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +class TestElementwiseDivOpBF16(OpTest): + def setUp(self): + self.op_type = "elementwise_div" + self.dtype = np.uint16 + + x = np.random.uniform(0.1, 1, [12, 13]).astype(np.float32) + y = np.random.uniform(0.1, 1, [12, 13]).astype(np.float32) + + out = np.divide(x, y) + + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-1) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=1) + + def test_check_grad_ingore_x(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseDivOp_scalar(ElementwiseDivOp): From 7683b0ff12770b9a75c2092e123b6df28741ea33 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 17 Feb 2022 09:22:55 +0000 Subject: [PATCH 2/8] refine rocm --- .../elementwise/elementwise_div_op.cu | 80 ++++++++----------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 9eb4b0352e533..04ccd022bb89c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -48,49 +48,37 @@ ElementwiseDivGrad(const framework::ExecutionContext& ctx, } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); +#define REGISTER_ELEMENTWISEDIV_BASE(op_name, grad, ...) \ + REGISTER_OP_CUDA_KERNEL( \ + op_name, \ + ops::Elementwise##grad##Kernel, \ + ops::Elementwise##grad##Kernel, \ + ops::Elementwise##grad##Kernel, \ + ops::Elementwise##grad##Kernel, \ + ops::Elementwise##grad##Kernel, \ + ops::Elementwise##grad##Kernel>, \ + ops::Elementwise##grad##Kernel>, \ + ##__VA_ARGS__); + +#define REGISTER_ELEMENTWISEDIV_EX(op_name, grad) \ + REGISTER_ELEMENTWISEDIV_BASE( \ + op_name, grad, \ + ops::Elementwise##grad##Kernel) + +#ifndef PADDLE_WITH_HIP +REGISTER_ELEMENTWISEDIV_EX(elementwise_div, Div) +REGISTER_ELEMENTWISEDIV_EX(elementwise_div_grad, DivGrad) +REGISTER_ELEMENTWISEDIV_EX(elementwise_div_grad_grad, DivDoubleGrad) +#else +REGISTER_ELEMENTWISEDIV_BASE(elementwise_div, Div) +REGISTER_ELEMENTWISEDIV_BASE(elementwise_div_grad, DivGrad) +REGISTER_ELEMENTWISEDIV_BASE(elementwise_div_grad_grad, DivDoubleGrad) +#endif From 81582b666d03645220d58661f3ad263283594f01 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 17 Feb 2022 12:13:12 +0000 Subject: [PATCH 3/8] refine code --- paddle/pten/kernels/gpu/math_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 7ad2effbffb24..625436964d86d 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -92,8 +92,8 @@ DEFINE_CUDA_ELEMENTWISE_OP(Divide) } // namespace pten using float16 = pten::dtype::float16; -using bfloat16 = pten::dtype::bfloat16 using complex64 = - ::pten::dtype::complex; +using bfloat16 = pten::dtype::bfloat16; +using complex64 = ::pten::dtype::complex; using complex128 = ::pten::dtype::complex; PT_REGISTER_KERNEL(add_raw, From 76399294f72d7057617784096132cd19536ed0d3 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Fri, 18 Feb 2022 09:05:07 +0000 Subject: [PATCH 4/8] refine op register --- .../elementwise/elementwise_div_op.cu | 119 +++++++++++++----- 1 file changed, 87 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 04ccd022bb89c..922c11355c092 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -48,37 +48,92 @@ ElementwiseDivGrad(const framework::ExecutionContext& ctx, } // namespace operators } // namespace paddle -#define REGISTER_ELEMENTWISEDIV_BASE(op_name, grad, ...) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, \ - ops::Elementwise##grad##Kernel, \ - ops::Elementwise##grad##Kernel, \ - ops::Elementwise##grad##Kernel, \ - ops::Elementwise##grad##Kernel, \ - ops::Elementwise##grad##Kernel, \ - ops::Elementwise##grad##Kernel>, \ - ops::Elementwise##grad##Kernel>, \ - ##__VA_ARGS__); - -#define REGISTER_ELEMENTWISEDIV_EX(op_name, grad) \ - REGISTER_ELEMENTWISEDIV_BASE( \ - op_name, grad, \ - ops::Elementwise##grad##Kernel) - -#ifndef PADDLE_WITH_HIP -REGISTER_ELEMENTWISEDIV_EX(elementwise_div, Div) -REGISTER_ELEMENTWISEDIV_EX(elementwise_div_grad, DivGrad) -REGISTER_ELEMENTWISEDIV_EX(elementwise_div_grad_grad, DivDoubleGrad) +#ifdef PADDLE_WITH_HIP +REGISTER_OP_CUDA_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel>, + ops::ElementwiseDivKernel>); +REGISTER_OP_CUDA_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel>, + ops::ElementwiseDivGradKernel>); +REGISTER_OP_CUDA_KERNEL( + elementwise_div_grad_grad, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel>, + ops::ElementwiseDivDoubleGradKernel>); #else -REGISTER_ELEMENTWISEDIV_BASE(elementwise_div, Div) -REGISTER_ELEMENTWISEDIV_BASE(elementwise_div_grad, DivGrad) -REGISTER_ELEMENTWISEDIV_BASE(elementwise_div_grad_grad, DivDoubleGrad) +REGISTER_OP_CUDA_KERNEL( + elementwise_div, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel>, + ops::ElementwiseDivKernel>); +REGISTER_OP_CUDA_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel>, + ops::ElementwiseDivGradKernel>); +REGISTER_OP_CUDA_KERNEL( + elementwise_div_grad_grad, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel>, + ops::ElementwiseDivDoubleGradKernel>); #endif From d0e217c5ca89ff68350b4dca0a1fd33ade9696ac Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Mon, 21 Feb 2022 01:50:27 +0000 Subject: [PATCH 5/8] solve conflict --- paddle/phi/kernels/gpu/math_kernel.cu | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 84792e4426e91..7825e5eb2cd8b 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -91,16 +91,10 @@ DEFINE_CUDA_ELEMENTWISE_OP(Divide) } // namespace phi -<<<<<<< HEAD:paddle/pten/kernels/gpu/math_kernel.cu -using float16 = pten::dtype::float16; -using bfloat16 = pten::dtype::bfloat16; -using complex64 = ::pten::dtype::complex; -using complex128 = ::pten::dtype::complex; -======= using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; ->>>>>>> dcfe198631058dbcd4fe6e887a4e514008ed1e68:paddle/phi/kernels/gpu/math_kernel.cu PT_REGISTER_KERNEL(add_raw, GPU, From 996fd81971e590ae0e18ef16654a639c5556fb04 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Mon, 21 Feb 2022 02:23:10 +0000 Subject: [PATCH 6/8] refine unittest --- .../fluid/tests/unittests/test_elementwise_div_op.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 7e1e327cc07db..ee189a982154c 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -80,18 +80,15 @@ def test_check_output(self): def test_check_grad_normal(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=1) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X")) + self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X")) def test_check_grad_ingore_y(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y')) + self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y')) @skip_check_grad_ci( From a4b2d9f9b6fcd732ba4c3eedf89a50566f99b05f Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Mon, 21 Feb 2022 06:04:21 +0000 Subject: [PATCH 7/8] refine unittest precision --- python/paddle/fluid/tests/unittests/test_elementwise_div_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index ee189a982154c..a43e56b0815a6 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -76,7 +76,7 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-1) + self.check_output_with_place(place) def test_check_grad_normal(self): place = core.CUDAPlace(0) From 911b0a3b61215282051993c3639170cea87ab4ea Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Tue, 22 Feb 2022 07:53:53 +0000 Subject: [PATCH 8/8] add rocm --- .../elementwise/elementwise_div_op.cu | 43 ------------------- .../device/gpu/rocm/rocm_device_function.h | 7 +++ 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 922c11355c092..9eb4b0352e533 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -48,48 +48,6 @@ ElementwiseDivGrad(const framework::ExecutionContext& ctx, } // namespace operators } // namespace paddle -#ifdef PADDLE_WITH_HIP -REGISTER_OP_CUDA_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); -#else REGISTER_OP_CUDA_KERNEL( elementwise_div, ops::ElementwiseDivKernel, @@ -136,4 +94,3 @@ REGISTER_OP_CUDA_KERNEL( paddle::platform::complex>, ops::ElementwiseDivDoubleGradKernel>); -#endif diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 63897bd671740..61bf1905fdb74 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -91,6 +91,13 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, return float16(__shfl_xor(static_cast(val), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, + bfloat16 val, + int width) { + return bfloat16(__shfl_xor(static_cast(val), width)); +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( unsigned mask, paddle::platform::complex val, int width) {