Commit d013984
[ROCM] fix softmax with loss and update python scripts, test=develop
qili93 committed Mar 2, 2021
1 parent 353dd0c commit d013984
Showing 10 changed files with 282 additions and 48 deletions.
115 changes: 112 additions & 3 deletions paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -8,7 +8,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cub/cub.cuh>
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
@@ -214,6 +220,60 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
if (threadIdx.x == 0) max_data[blockIdx.x] = 0;
}

#ifdef __HIPCC__ // @{ HIP separate kernels for RowReductionForDiffMaxSum
// Note(qili93): HIP does not support return in kernel, so
// RowReductionForDiffMaxSum is separated into the two kernels below
template <typename T, int BlockDim>
static __global__ void RowReductionForSum(const T* logits_data, T* max_data,
T* softmax, int64_t d, int axis_dim) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;

int64_t remain = d / axis_dim;
int64_t idx_n = blockIdx.x / remain;
int64_t idx_remain = blockIdx.x % remain;
int64_t beg_idx = idx_n * d + threadIdx.x * remain + idx_remain;
int64_t end_idx = (idx_n + 1) * d;

auto block_max = max_data[blockIdx.x];
int64_t step = BlockDim * remain;

softmax[beg_idx] = logits_data[beg_idx] - block_max;
T diff_max_sum = exp_on_device(softmax[beg_idx]);
auto idx = beg_idx + step;
while (idx < end_idx) {
softmax[idx] = logits_data[idx] - block_max;
diff_max_sum += exp_on_device(softmax[idx]);
idx += step;
}

diff_max_sum =
BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
if (threadIdx.x == 0) max_data[blockIdx.x] = log_on_device(diff_max_sum);
}

template <typename T, int BlockDim, bool CalculateLogSoftmax = false>
static __global__ void RowReductionForDiff(const T* logits_data, T* max_data,
T* softmax, int d, int axis_dim) {
int remain = d / axis_dim;
int idx_n = blockIdx.x / remain;
int idx_remain = blockIdx.x % remain;
int beg_idx = idx_n * d + threadIdx.x * remain + idx_remain;
int end_idx = (idx_n + 1) * d;
int step = BlockDim * remain;

T diff_max_sum = max_data[blockIdx.x];
softmax[beg_idx] -= diff_max_sum;
beg_idx += step;
while (beg_idx < end_idx) {
softmax[beg_idx] -= diff_max_sum;
beg_idx += step;
}

__syncthreads();
if (threadIdx.x == 0) max_data[blockIdx.x] = 0;
}
#endif // @} End HIP separate kernels for RowReductionForDiffMaxSum

// Make sure that BlockDim <= axis_dim
template <typename T, int BlockDim>
static __global__ void RowReductionForSoftmaxAndCrossEntropy(
@@ -345,6 +405,28 @@ static void HardLabelSoftmaxWithCrossEntropy(
int64_t grid_dim = n * d / axis_dim;
auto stream = ctx.stream();

#ifdef __HIPCC__
#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: { \
hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForMax<T, BlockDim>), \
dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \
loss_data, d, axis_dim); \
hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForSum<T, BlockDim>), \
dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \
loss_data, softmax_data, d, axis_dim); \
hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForDiff<T, BlockDim>), \
dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \
loss_data, softmax_data, d, axis_dim); \
platform::ForRange<platform::CUDADeviceContext> for_range(ctx, n* d); \
if (ignore_idx >= 0 && ignore_idx < axis_dim) { \
for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx<T>( \
labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \
} else { \
for_range(HardLabelSoftmaxWithCrossEntropyFunctor<T>( \
labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \
} \
} break
#else
#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: { \
RowReductionForMax<T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>( \
@@ -361,6 +443,7 @@ static void HardLabelSoftmaxWithCrossEntropy(
labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \
} \
} break
#endif

switch (block_dim) {
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
@@ -383,13 +466,27 @@ static void HardLabelSoftmaxWithCrossEntropy(
template <typename T>
static void SoftmaxWithCrossEntropyFusedKernel(
const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data,
int64_t n, int64_t d, int axis_dim, cudaStream_t stream) {
int64_t n, int64_t d, int axis_dim, gpuStream_t stream) {
constexpr int kMaxBlockDim = 512;
int64_t block_dim = axis_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(axis_dim)));
int64_t grid_dim = n * d / axis_dim;

#ifdef __HIPCC__
#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: \
hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForMax<T, BlockDim>), \
dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \
loss_data, d, axis_dim); \
hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForSum<T, BlockDim>), \
dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \
loss_data, softmax_data, d, axis_dim); \
hipLaunchKernelGGL( \
HIP_KERNEL_NAME(RowReductionForSoftmaxAndCrossEntropy<T, BlockDim>), \
dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, labels_data, \
loss_data, softmax_data, d, axis_dim); \
break
#else
#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: \
RowReductionForMax<T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>( \
@@ -400,6 +497,7 @@ static void SoftmaxWithCrossEntropyFusedKernel(
T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>( \
logits_data, labels_data, loss_data, softmax_data, d, axis_dim); \
break
#endif

switch (block_dim) {
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
@@ -536,6 +634,16 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle

namespace ops = paddle::operators;
#ifdef PADDLE_WITH_HIP
// MIOPEN does not support double
REGISTER_OP_CUDA_KERNEL(
softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyCUDAKernel<paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>);
#else
REGISTER_OP_CUDA_KERNEL(
softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyCUDAKernel<paddle::platform::float16>,
@@ -545,3 +653,4 @@ REGISTER_OP_CUDA_KERNEL(
ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
#endif
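Taken together, the row-reduction kernels in this file implement the standard numerically stable log-sum-exp form of softmax with cross-entropy; the HIP path only differs in splitting the middle reduction into RowReductionForSum and RowReductionForDiff because, as the in-code note says, an early return inside the kernel is not available there. A minimal NumPy sketch of the per-row math (illustration only, not the device code):

import numpy as np

def stable_softmax_with_cross_entropy(logits, labels):
    # Pass 1 (RowReductionForMax): per-row maximum, for numerical stability.
    row_max = logits.max(axis=1, keepdims=True)
    # Pass 2 (RowReductionForSum): shift by the max and keep log(sum(exp(.))) per row.
    shifted = logits - row_max
    log_sum = np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    # Pass 3 (RowReductionForDiff / RowReductionForSoftmaxAndCrossEntropy):
    # subtract again to get log-softmax; the hard-label loss is -log_softmax[label].
    log_softmax = shifted - log_sum
    loss = -log_softmax[np.arange(len(labels)), labels]
    return np.exp(log_softmax), loss

softmax, loss = stable_softmax_with_cross_entropy(
    np.array([[2.0, 1.0, 0.1]]), np.array([0]))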
5 changes: 5 additions & 0 deletions paddle/fluid/platform/for_range.h
@@ -62,7 +62,12 @@ struct ForRange<CUDADeviceContext> {

template <typename Function>
inline void operator()(Function func) const {
#ifdef __HIPCC__
// HIP core dumps when threads > 256
constexpr int num_threads = 256;
#else
constexpr int num_threads = 1024;
#endif
size_t block_size = limit_ <= num_threads ? limit_ : num_threads;
size_t grid_size = (limit_ + num_threads - 1) / num_threads;

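For reference, a small Python sketch of the launch-shape arithmetic in ForRange above (the helper name is hypothetical), showing how the lower ROCm thread cap changes the grid:

def launch_shape(limit, num_threads):
    # Mirrors ForRange: block_size = min(limit, num_threads),
    # grid_size = ceil(limit / num_threads).
    block_size = limit if limit <= num_threads else num_threads
    grid_size = (limit + num_threads - 1) // num_threads
    return grid_size, block_size

print(launch_shape(100000, 1024))  # (98, 1024)   CUDA build
print(launch_shape(100000, 256))   # (391, 256)   ROCm build, capped at 256 threads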
11 changes: 9 additions & 2 deletions python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -91,7 +91,11 @@ def test_dygraph(self):
x = fluid.dygraph.to_variable(np_x)
z = eval("paddle.%s(x).numpy()" % self.op_type)
z_expected = eval("np.%s(np_x)" % self.op_type)
self.assertEqual(z, z_expected)
# ROCm fails the exact assertEqual check; use np.allclose instead
if core.is_compiled_with_rocm():
self.assertTrue(np.allclose(z, z_expected))
else:
self.assertEqual(z, z_expected)


class TestSigmoid(TestActivation):
@@ -2651,7 +2655,10 @@ def test_check_grad(self):
create_test_act_fp16_class(TestELU)
create_test_act_fp16_class(TestReciprocal)
create_test_act_fp16_class(TestLog)
create_test_act_fp16_class(TestLog2, atol=5e-2)
if core.is_compiled_with_rocm():
create_test_act_fp16_class(TestLog2, atol=5e-2, grad_atol=0.85)
else:
create_test_act_fp16_class(TestLog2, atol=5e-2)
create_test_act_fp16_class(TestLog10, atol=5e-2)
create_test_act_fp16_class(TestLog1p, grad_atol=0.9)
create_test_act_fp16_class(TestSquare)
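The change above replaces the exact assertEqual comparison with a tolerance-based one on ROCm. A standalone sketch of the pattern, with made-up values for illustration:

import numpy as np

z_expected = np.tanh(np.array([0.5], dtype=np.float32))  # reference value
z = z_expected + 1e-7                                     # device result with tiny drift

# An exact equality check would fail on the last few bits; np.allclose with its
# default tolerances (rtol=1e-05, atol=1e-08) accepts the small difference.
assert np.allclose(z, z_expected)
assert not (z == z_expected).all()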
14 changes: 12 additions & 2 deletions python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
@@ -171,7 +171,11 @@ def compute_v2(x_np):
class TestBatchNormChannelLast(unittest.TestCase):
def setUp(self):
self.original_dtyep = paddle.get_default_dtype()
paddle.set_default_dtype("float64")
# MIOPEN does not support the double data type
if core.is_compiled_with_rocm():
paddle.set_default_dtype("float32")
else:
paddle.set_default_dtype("float64")
self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
self.places.append(fluid.CUDAPlace(0))
@@ -219,7 +223,13 @@ def test_3d(self):
channel_first_x = paddle.transpose(x, [0, 4, 1, 2, 3])
y2 = net2(channel_first_x)
y2 = paddle.transpose(y2, [0, 2, 3, 4, 1])
self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True)
if core.is_compiled_with_rocm():
# HIP will fail without an explicit atol
self.assertEqual(
np.allclose(
y1.numpy(), y2.numpy(), atol=1e-07), True)
else:
self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True)


class TestBatchNormUseGlobalStats(unittest.TestCase):
6 changes: 4 additions & 2 deletions python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -298,7 +298,8 @@ def setUp(self):
self.use_mkldnn = False
self.fuse_relu_before_depthwise_conv = False
self.data_format = "AnyLayout"
self.dtype = np.float64
# explicitly use float32 for ROCm, as MIOpen does not yet support float64
self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
self.init_kernel_type()
self.init_group()
self.init_dilation()
@@ -732,7 +733,8 @@ def setUp(self):
self.use_cuda = False
self.use_mkldnn = False
self.fuse_relu_before_depthwise_conv = False
self.dtype = np.float64
# explicitly use float32 for ROCm, as MIOpen does not yet support float64
self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
self.init_kernel_type()
self.init_group()
self.init_dilation()
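The same float64-to-float32 fallback recurs across several of these tests; a hypothetical helper that captures the pattern (only the core.is_compiled_with_rocm() call comes from the diff itself):

import numpy as np
import paddle.fluid.core as core

def pick_test_dtype(preferred=np.float64):
    # MIOpen has no float64 kernels, so ROCm builds fall back to float32.
    if preferred == np.float64 and core.is_compiled_with_rocm():
        return np.float32
    return preferred

# e.g. in a test's setUp(): self.dtype = pick_test_dtype()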
6 changes: 5 additions & 1 deletion python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -41,6 +41,8 @@ def max_pool2D_forward_naive(x,
exclusive=True,
adaptive=False,
data_type=np.float64):
if data_type == np.float64 and core.is_compiled_with_rocm():
data_type = np.float32
N, C, H, W = x.shape
if global_pool == 1:
ksize = [H, W]
@@ -81,6 +83,8 @@ def avg_pool2D_forward_naive(x,
exclusive=True,
adaptive=False,
data_type=np.float64):
if data_type == np.float64 and core.is_compiled_with_rocm():
data_type = np.float32
N, C, H, W = x.shape
if global_pool == 1:
ksize = [H, W]
@@ -340,7 +344,7 @@ def init_kernel_type(self):
self.use_cudnn = False

def init_data_type(self):
self.dtype = np.float64
self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

def init_pool_type(self):
self.pool_type = "avg"
12 changes: 9 additions & 3 deletions python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -55,7 +55,8 @@ def setUp(self):
self.op_type = "softmax"
self.use_cudnn = False
self.use_mkldnn = False
self.dtype = np.float64
# explicitly use float32 for ROCm, as MIOpen does not yet support float64
self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
self.init_kernel_type()
self.shape = self.get_x_shape()
self.axis = self.get_axis()
@@ -338,8 +339,13 @@ def test_dygraph_check(self):
for r in [out1, out2]:
self.assertEqual(np.allclose(out_ref, r.numpy()), True)

out = self.softmax(x, dtype=np.float64)
out_ref = ref_softmax(self.x_np, axis=-1, dtype=np.float64)
# explicitly use float32 for ROCm, as MIOpen does not yet support float64
if core.is_compiled_with_rocm():
out = self.softmax(x, dtype=np.float32)
out_ref = ref_softmax(self.x_np, axis=-1, dtype=np.float32)
else:
out = self.softmax(x, dtype=np.float64)
out_ref = ref_softmax(self.x_np, axis=-1, dtype=np.float64)
self.assertEqual(np.allclose(out_ref, out.numpy()), True)

paddle.enable_static()

1 comment on commit d013984

@paddle-bot-old
Congratulation! Your pull request passed all required CI. You could ask reviewer(s) to approve and merge. 🎉