diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc
index 3c7ffc01edf07..17c24fa905b5c 100644
--- a/paddle/phi/kernels/dist_grad_kernel.cc
+++ b/paddle/phi/kernels/dist_grad_kernel.cc
@@ -98,11 +98,6 @@ PD_REGISTER_KERNEL(
     dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {}
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PD_REGISTER_KERNEL(dist_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::DistGradKernel,
-                   phi::dtype::float16,
-                   float,
-                   double) {}
+PD_REGISTER_KERNEL(
+    dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {}
 #endif
diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h
index 26ead61e4e5f9..b493e2ac41bfd 100644
--- a/paddle/phi/kernels/funcs/math_cuda_utils.h
+++ b/paddle/phi/kernels/funcs/math_cuda_utils.h
@@ -23,9 +23,6 @@ limitations under the License. */
 
 #include <algorithm>
 
-#include "paddle/phi/backends/gpu/gpu_device_function.h"
-#include "paddle/phi/common/data_type.h"
-
 namespace phi {
 namespace funcs {
 
@@ -173,7 +170,11 @@ struct KeyValuePair {
 template <typename T>
 __inline__ __device__ T WarpReduceSum(T val, unsigned lane_mask) {
   for (int mask = HALF_WARP; mask > 0; mask >>= 1)
-    val += phi::backends::gpu::CudaShuffleXorSync(lane_mask, val, mask);
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
+    val += __shfl_xor_sync(lane_mask, val, mask, warpSize);
+#else
+    val += __shfl_xor(val, mask, warpSize);
+#endif
   return val;
 }
 
@@ -242,8 +243,11 @@ __inline__ __device__ T BlockReduceSumV2(T *val) {
 template <typename T>
 __inline__ __device__ T WarpReduceMax(T val, unsigned lane_mask) {
   for (int mask = HALF_WARP; mask > 0; mask >>= 1)
-    val = std::max(
-        val, phi::backends::gpu::CudaShuffleXorSync(lane_mask, val, mask));
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
+    val = max(val, __shfl_xor_sync(lane_mask, val, mask, warpSize));
+#else
+    val = max(val, __shfl_xor(val, mask, warpSize));
+#endif
   return val;
 }
 
@@ -261,8 +265,11 @@ __inline__ __device__ T WarpReduceMaxV2(T *val) {
 template <typename T>
 __inline__ __device__ T WarpReduceMin(T val, unsigned lane_mask) {
   for (int mask = HALF_WARP; mask > 0; mask >>= 1)
-    val = std::min(
-        val, phi::backends::gpu::CudaShuffleXorSync(lane_mask, val, mask));
+#if defined(PADDLE_WITH_CUDA) && (__CUDA_ARCH__ >= 350 && CUDA_VERSION >= 9000)
+    val = min(val, __shfl_xor_sync(lane_mask, val, mask, warpSize));
+#else
+    val = min(val, __shfl_xor(val, mask, warpSize));
+#endif
   return val;
 }
 
@@ -303,7 +310,7 @@ __inline__ __device__ T BlockReduceMax(T val, unsigned mask) {
 
   // align block_span to warpSize
   int block_span = (blockDim.x + warpSize - 1) >> 5;
-  val = (lane < block_span) ? shared[lane] : std::numeric_limits<T>::min();
+  val = (lane < block_span) ? shared[lane] : -1e10f;
   val = WarpReduceMax(val, mask);
 
   return val;
@@ -351,7 +358,7 @@ __inline__ __device__ T BlockReduceMin(T val, unsigned mask) {
 
   // align block_span to warpSize
   int block_span = (blockDim.x + warpSize - 1) >> 5;
-  val = (lane < block_span) ? shared[lane] : std::numeric_limits<T>::max();
+  val = (lane < block_span) ? shared[lane] : 1e10f;
   val = WarpReduceMin(val, mask);
 
   return val;
diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu
index 0a197053e1016..5040be8eaaca7 100644
--- a/paddle/phi/kernels/gpu/dist_kernel.cu
+++ b/paddle/phi/kernels/gpu/dist_kernel.cu
@@ -12,12 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <algorithm>
-
+#include "paddle/phi/kernels/dist_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
-#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/dist_kernel.h"
 #include "paddle/phi/kernels/elementwise_subtract_kernel.h"
 #include "paddle/phi/kernels/funcs/math_cuda_utils.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
@@ -27,56 +24,47 @@ namespace phi {
 
 #define FULL_MASK 0xffffffff
 
-template <typename Tx, typename Ty = Tx>
+template <typename T>
 struct ZeroOrderFunctor {
-  HOSTDEVICE explicit inline ZeroOrderFunctor() {}
-
-  HOSTDEVICE inline Ty operator()(const Tx& x, const Tx& y) const {
-    return static_cast<Ty>(x != y);
+ public:
+  __device__ T operator()(const T& x, const T& y) const {
+    return static_cast<T>((x - y) != 0);
   }
 };
 
-template <typename Tx, typename Ty = Tx>
+template <typename T>
 struct OtherOrderFunctor {
-  HOSTDEVICE explicit inline OtherOrderFunctor(const Ty& _p_order)
-      : p_order(_p_order) {}
-
-  HOSTDEVICE inline Ty operator()(const Tx& x, const Tx& y) const {
-    return static_cast<Ty>(
-        pow(abs(static_cast<Ty>(x) - static_cast<Ty>(y)), p_order));
+  explicit OtherOrderFunctor(const T& p_order) : p_order_(p_order) {}
+  __device__ T operator()(const T& x, const T& y) const {
+    return static_cast<T>(pow(abs(x - y), p_order_));
   }
 
  private:
-  Ty p_order;
+  T p_order_;
 };
 
-template <typename Tx, typename Ty = Tx>
+template <typename T>
 struct PowFunctor {
-  HOSTDEVICE explicit inline PowFunctor(const Ty& _p_order)
-      : p_order(_p_order) {}
-
-  HOSTDEVICE inline Tx operator()(const Tx x) const {
-    return static_cast<Tx>(pow(static_cast<Ty>(x), p_order));
+  explicit PowFunctor(const T& p_order) : p_order_(p_order) {}
+  HOSTDEVICE inline T operator()(const T x) const {
+    return static_cast<T>(pow(x, p_order_));
  }
-
- private:
-  Ty p_order;
+  T p_order_;
 };
 
 template <typename T, typename Functor>
 __global__ void ReduceSumWithSubtract(
     const T* x, const T* y, T* out, int64_t N, Functor func) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
-  MT sum_val(0.0);
+  T sum_val = 0;
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
-    sum_val += static_cast<MT>(func(x[i], y[i]));
+    sum_val += func(x[i], y[i]);
   }
 
   __syncthreads();
-  sum_val = phi::funcs::BlockReduceSum<MT>(sum_val, FULL_MASK);
+  sum_val = phi::funcs::BlockReduceSum<T>(sum_val, FULL_MASK);
   if (threadIdx.x == 0) {
-    out[blockIdx.x] = static_cast<T>(sum_val);
+    out[blockIdx.x] = sum_val;
   }
 }
 
@@ -85,10 +73,10 @@ __global__ void ReduceMaxWithSubtract(const T* x,
                                       const T* y,
                                       T* out,
                                       int64_t N) {
-  T max_val = std::numeric_limits<T>::min();
+  T max_val = -1e10f;
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
-    max_val = std::max(max_val, abs(x[i] - y[i]));
+    max_val = max(max_val, abs(x[i] - y[i]));
   }
 
   __syncthreads();
@@ -103,10 +91,10 @@ __global__ void ReduceMinWithSubtract(const T* x,
                                       const T* y,
                                       T* out,
                                       int64_t N) {
-  T min_val = std::numeric_limits<T>::max();
+  T min_val = 1e10f;
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
-    min_val = std::min(min_val, abs(x[i] - y[i]));
+    min_val = min(min_val, abs(x[i] - y[i]));
   }
 
   __syncthreads();
@@ -122,7 +110,6 @@ void DistKernel(const Context& dev_ctx,
                 const DenseTensor& y,
                 float p,
                 DenseTensor* out) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   DenseTensor intermediate;
   const T* x_ptr = x.data<T>();
   const T* y_ptr = y.data<T>();
@@ -144,8 +131,9 @@ void DistKernel(const Context& dev_ctx,
       ReduceSumWithSubtract<T>
           <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
               x_ptr, y_ptr, i_ptr, n, ZeroOrderFunctor<T>());
-      phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          dev_ctx, intermediate, out, kps::IdentityFunctor<T>(), reduce_axis);
+      phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+          dev_ctx, intermediate, out, kps::IdentityFunctor<T>(), reduce_axis);
+
     } else if (p == INFINITY) {
       ReduceMaxWithSubtract<T>
           <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
@@ -162,19 +150,19 @@ void DistKernel(const Context& dev_ctx,
           dev_ctx, intermediate, out, kps::IdentityFunctor<T>(), reduce_axis);
 
     } else {
-      MT p_order = static_cast<MT>(p);
+      T p_order = static_cast<T>(p);
       ReduceSumWithSubtract<T>
           <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
-              x_ptr, y_ptr, i_ptr, n, OtherOrderFunctor<T, MT>(p_order));
-      phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          dev_ctx, intermediate, out, kps::IdentityFunctor<T>(), reduce_axis);
+              x_ptr, y_ptr, i_ptr, n, OtherOrderFunctor<T>(p_order));
+      phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+          dev_ctx, intermediate, out, kps::IdentityFunctor<T>(), reduce_axis);
 
       const DenseTensor* tmp_norm = out;
       std::vector<const DenseTensor*> ins = {tmp_norm};
       std::vector<DenseTensor*> outs = {out};
-      MT p_order_ = static_cast<MT>(static_cast<MT>(1.) / p_order);
+      T p_order_ = static_cast<T>(1. / p_order);
       phi::funcs::ElementwiseKernel<T>(
-          dev_ctx, ins, &outs, PowFunctor<T, MT>(p_order_));
+          dev_ctx, ins, &outs, PowFunctor<T>(p_order_));
     }
 
   } else {
@@ -185,10 +173,4 @@ void DistKernel(const Context& dev_ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(dist,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::DistKernel,
-                   phi::dtype::float16,
-                   float,
-                   double) {}
+PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float, double) {}
diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py b/python/paddle/fluid/tests/unittests/test_dist_op.py
index 3654da4e00fae..96c0de915cff2 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_op.py
@@ -158,46 +158,6 @@ def init_case(self):
         self.p = 1.5
 
 
-class TestDistFP16Op(OpTest):
-    def init_data_type(self):
-        self.data_type = 'float16'
-
-
-class TestDistFP16OpCase1(TestDistFP16Op):
-    def init_case(self):
-        self.x_shape = (3, 5, 5, 6)
-        self.y_shape = (5, 5, 6)
-        self.p = 1.0
-
-
-class TestDistFP16OpCase2(TestDistFP16Op):
-    def init_case(self):
-        self.x_shape = (10, 10)
-        self.y_shape = (4, 10, 10)
-        self.p = 2.0
-
-
-class TestDistFP16OpCase3(TestDistFP16Op):
-    def init_case(self):
-        self.x_shape = (15, 10)
-        self.y_shape = (15, 10)
-        self.p = float("inf")
-
-
-class TestDistFP16OpCase4(TestDistFP16Op):
-    def init_case(self):
-        self.x_shape = (2, 3, 4, 5, 8)
-        self.y_shape = (3, 1, 5, 8)
-        self.p = float("-inf")
-
-
-class TestDistFP16OpCase5(TestDistFP16Op):
-    def init_case(self):
-        self.x_shape = (4, 1, 4, 8)
-        self.y_shape = (2, 2, 1, 4, 4, 8)
-        self.p = 1.5
-
-
 class TestDistAPI(unittest.TestCase):
     def init_data_type(self):
         self.data_type = (
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 57550c86368a2..2235cf93cfb60 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -675,8 +675,8 @@ def dist(x, y, p=2, name=None):
         ||z||_{p}=(\sum_{i=1}^{m}|z_i|^p)^{\\frac{1}{p}}
 
     Args:
-        x (Tensor): 1-D to 6-D Tensor, its data type is float16, float32 or float64.
-        y (Tensor): 1-D to 6-D Tensor, its data type is float16, float32 or float64.
+        x (Tensor): 1-D to 6-D Tensor, its data type is float32 or float64.
+        y (Tensor): 1-D to 6-D Tensor, its data type is float32 or float64.
         p (float, optional): The norm to be computed, its data type is float32 or float64. Default: 2.
         name (str, optional): The default value is `None`. Normally there is no need for user to set this
             property. For more information, please refer to :ref:`api_guide_Name`.
@@ -706,12 +706,8 @@ def dist(x, y, p=2, name=None):
     if in_dygraph_mode():
         return _C_ops.dist(x, y, p)
 
-    check_variable_and_dtype(
-        x, 'dtype', ['float16', 'float32', 'float64'], 'dist'
-    )
-    check_variable_and_dtype(
-        y, 'dtype', ['float16', 'float32', 'float64'], 'dist'
-    )
+    check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'dist')
+    check_variable_and_dtype(y, 'dtype', ['float32', 'float64'], 'dist')
     check_type(p, 'p', (float, int), 'dist')
     helper = LayerHelper("dist", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)