Optimize performance of log_softmax #38992

Merged Mar 14, 2022; 20 commits; changes shown from 13 commits.
45 changes: 34 additions & 11 deletions paddle/fluid/operators/log_softmax_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include "paddle/fluid/operators/common_infer_shape_functions.h"
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"

namespace paddle {
namespace operators {
@@ -31,17 +32,25 @@ class LogSoftmaxOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
-   auto input_data_type =
-       framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
+   // Choose the cudnn kernel if the runtime supports it.
+   framework::LibraryType library{framework::LibraryType::kPlain};
+   framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+   auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+   if (platform::CanCUDNNBeUsed(ctx)) {
+     library = framework::LibraryType::kCUDNN;
+   }
+#endif
#ifdef PADDLE_WITH_MKLDNN
-   if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-     return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                    framework::DataLayout::kMKLDNN,
-                                    framework::LibraryType::kMKLDNN);
+   if (library == framework::LibraryType::kPlain &&
+       this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+     library = framework::LibraryType::kMKLDNN;
+     layout = framework::DataLayout::kMKLDNN;
    }
#endif
-   return framework::OpKernelType(input_data_type, ctx.GetPlace());
+   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                  library);
  }
};
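Note: the cuDNN path is gated on platform::CanCUDNNBeUsed(ctx). As a rough sketch of the kind of check involved (the real implementation lives in Paddle's platform headers and may differ in detail), it requires the op's use_cudnn attribute, a GPU place, and a live cuDNN handle:

// Rough sketch only; the name CanCUDNNBeUsedSketch is hypothetical and this
// is not the authoritative Paddle implementation.
inline bool CanCUDNNBeUsedSketch(const framework::ExecutionContext& ctx) {
  // The op must opt in via its use_cudnn attribute.
  bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr<bool>("use_cudnn");
  // The kernel must actually be running on a GPU place.
  use_cudnn = use_cudnn && platform::is_gpu_place(ctx.GetPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (use_cudnn) {
    // The device context must carry an initialized cuDNN/MIOpen handle.
    auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
    use_cudnn = use_cudnn && dev_ctx.cudnn_handle() != nullptr;
  }
#endif
  return use_cudnn;
}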

@@ -56,6 +65,11 @@ class LogSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
             "The dimension index of Input(x) to perform log_softmax,"
             "default -1 for last dimension")
        .SetDefault(-1);
+   AddAttr<bool>(

Reviewer (Contributor): I would suggest not adding a new attribute; just modify the original CUDA kernel directly.
Author: Done, thanks.

"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false)
.AsExtra();
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false)
@@ -98,9 +112,18 @@ class LogSoftmaxGradOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
-   return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
-                                      ctx, framework::GradVarName("Out")),
-                                  ctx.device_context());
+   // Choose the cudnn kernel if the runtime supports it.
+   framework::LibraryType library{framework::LibraryType::kPlain};
+   framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+   auto input_data_type = OperatorWithKernel::IndicateVarDataType(
+       ctx, framework::GradVarName("Out"));
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+   if (platform::CanCUDNNBeUsed(ctx)) {
+     library = framework::LibraryType::kCUDNN;
+   }
+#endif
+   return framework::OpKernelType(input_data_type, ctx.device_context(),
+                                  layout, library);
  }
};

65 changes: 65 additions & 0 deletions paddle/fluid/operators/log_softmax_op.cu
@@ -18,10 +18,13 @@
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/functors.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

#define LAUNCH_WARP_FORWAR_COMPUTE(near_greater_power_of_two) \
case near_greater_power_of_two: \
ComputeLogSoftmaxForwardInWarp< \
@@ -468,6 +471,36 @@ class LogSoftmaxGradKernel<platform::CUDADeviceContext, T>
}
};

template <typename T>
class LogSoftmaxCUDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<Tensor>("X");
auto *out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());

int input_axis = ctx.Attr<int>("axis");
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
phi::SoftmaxForwardCUDAKernelDriver<T, true>(dev_ctx, *x, input_axis, out);
}
};

template <typename T>
class LogSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *out = ctx.Input<Tensor>("Out");
auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());

int input_axis = ctx.Attr<int>("axis");
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
phi::SoftmaxBackwardCUDAKernelDriver<T, true>(dev_ctx, *out, *dout,
input_axis, dx);
}
};

} // namespace operators
} // namespace paddle

@@ -483,3 +516,35 @@ REGISTER_OP_CUDA_KERNEL(
ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, double>,
ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>,
ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::bfloat16>);
#ifdef PADDLE_WITH_HIP
REGISTER_OP_KERNEL(log_softmax, CUDNN, plat::CUDAPlace,
ops::LogSoftmaxCUDNNKernel<float>,
ops::LogSoftmaxCUDNNKernel<plat::float16>,
ops::LogSoftmaxCUDNNKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(log_softmax_grad, CUDNN, plat::CUDAPlace,
ops::LogSoftmaxGradCUDNNKernel<float>,
ops::LogSoftmaxGradCUDNNKernel<plat::float16>,
ops::LogSoftmaxGradCUDNNKernel<plat::bfloat16>);
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_OP_KERNEL(log_softmax, CUDNN, plat::CUDAPlace,
ops::LogSoftmaxCUDNNKernel<float>,
ops::LogSoftmaxCUDNNKernel<double>,
ops::LogSoftmaxCUDNNKernel<plat::float16>,
ops::LogSoftmaxCUDNNKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(log_softmax_grad, CUDNN, plat::CUDAPlace,
ops::LogSoftmaxGradCUDNNKernel<float>,
ops::LogSoftmaxGradCUDNNKernel<double>,
ops::LogSoftmaxGradCUDNNKernel<plat::float16>,
ops::LogSoftmaxGradCUDNNKernel<plat::bfloat16>);
#else
REGISTER_OP_KERNEL(log_softmax, CUDNN, plat::CUDAPlace,
ops::LogSoftmaxCUDNNKernel<float>,
ops::LogSoftmaxCUDNNKernel<double>,
ops::LogSoftmaxCUDNNKernel<plat::float16>);
REGISTER_OP_KERNEL(log_softmax_grad, CUDNN, plat::CUDAPlace,
ops::LogSoftmaxGradCUDNNKernel<float>,
ops::LogSoftmaxGradCUDNNKernel<double>,
ops::LogSoftmaxGradCUDNNKernel<plat::float16>);
#endif

Reviewer (Contributor): These three registration blocks could be collapsed with a variadic macro; see the sketch after this diff.

#endif
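
A minimal sketch of the reviewer's variadic-macro suggestion. The LSM_* helpers and REGISTER_LOG_SOFTMAX_CUDNN are hypothetical names, not existing Paddle infrastructure; only REGISTER_OP_KERNEL is from the codebase. A small fixed-arity FOR_EACH expands each kernel template over the scalar types, so every #if branch states its type list exactly once:

// Hypothetical helpers; illustrative only, not Paddle API.
#define LSM_APPLY_1(M, T1) M(T1)
#define LSM_APPLY_2(M, T1, T2) M(T1), M(T2)
#define LSM_APPLY_3(M, T1, T2, T3) M(T1), M(T2), M(T3)
#define LSM_APPLY_4(M, T1, T2, T3, T4) M(T1), M(T2), M(T3), M(T4)
#define LSM_PICK(_1, _2, _3, _4, NAME, ...) NAME
#define LSM_FOR_EACH(M, ...)                                        \
  LSM_PICK(__VA_ARGS__, LSM_APPLY_4, LSM_APPLY_3, LSM_APPLY_2,      \
           LSM_APPLY_1)(M, __VA_ARGS__)

#define LSM_FWD_KERNEL(T) ops::LogSoftmaxCUDNNKernel<T>
#define LSM_GRAD_KERNEL(T) ops::LogSoftmaxGradCUDNNKernel<T>

// One variadic macro registers forward and backward for a given type list.
#define REGISTER_LOG_SOFTMAX_CUDNN(...)                             \
  REGISTER_OP_KERNEL(log_softmax, CUDNN, plat::CUDAPlace,           \
                     LSM_FOR_EACH(LSM_FWD_KERNEL, __VA_ARGS__));    \
  REGISTER_OP_KERNEL(log_softmax_grad, CUDNN, plat::CUDAPlace,      \
                     LSM_FOR_EACH(LSM_GRAD_KERNEL, __VA_ARGS__))

#ifdef PADDLE_WITH_HIP
REGISTER_LOG_SOFTMAX_CUDNN(float, plat::float16, plat::bfloat16);
#elif CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_LOG_SOFTMAX_CUDNN(float, double, plat::float16, plat::bfloat16);
#else
REGISTER_LOG_SOFTMAX_CUDNN(float, double, plat::float16);
#endif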
1 change: 0 additions & 1 deletion paddle/fluid/operators/unity_build_rule.cmake
@@ -142,7 +142,6 @@ register_unity_group(cc
    lod_rank_table_op.cc
    lod_reset_op.cc
    lod_tensor_to_array_op.cc
-   log_softmax_op.cc
    lookup_table_dequant_op.cc
    lrn_op.cc
    mkldnn/lrn_mkldnn_op.cc
96 changes: 68 additions & 28 deletions paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -351,8 +351,17 @@ __global__ void WarpSoftmaxForward(T* softmax,
    VecT* softmax_v =
        reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
    VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
-   kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
-       &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
+   if (LogMode) {
+     kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnaryLogFunctor<AccT>>(
+         &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor<AccT>());
+     kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+         &out_tmp[i][0][0],
+         &srcdata[i][0][0],
+         UnarySubFunctor<AccT>(std::log(sum[i])));
+   } else {
+     kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
+         &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
+   }
    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
        &softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
  }
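
Note: in the LogMode branch, srcdata is assumed to already hold e^{x_i - m} at this point (as in the existing division path), so the kernel takes the log and subtracts log(sum) rather than dividing by the sum, which is the numerically stable identity:

\[
\operatorname{log\_softmax}(x_i)
  = \log\frac{e^{x_i - m}}{\sum_j e^{x_j - m}}
  = (x_i - m) - \log\sum_j e^{x_j - m},
  \qquad m = \max_j x_j .
\]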
@@ -434,15 +443,25 @@ __global__ void WarpSoftmaxBackward(T* dst,
  AccT sum_tmp[kBatchSize][kLoopsV][kVSize];
  AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[0][0][0]);
  AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[0][0][0]);
-  kps::ElementwiseBinary<AccT, AccT, kStep, 1, 1, kps::MulFunctor<AccT>>(
-      &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor<AccT>());
-  kps::Reduce<AccT,
-              kVItem,
-              kBatchSize,
-              1,
-              kps::AddFunctor<AccT>,
-              kps::details::ReduceMode::kLocalMode>(
-      &sum[0], &sum_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
+  if (LogMode) {
+    kps::Reduce<AccT,
+                kVItem,
+                kBatchSize,
+                1,
+                kps::AddFunctor<AccT>,
+                kps::details::ReduceMode::kLocalMode>(
+        &sum[0], &grad_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
+  } else {
+    kps::ElementwiseBinary<AccT, AccT, kStep, 1, 1, kps::MulFunctor<AccT>>(
+        &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor<AccT>());
+    kps::Reduce<AccT,
+                kVItem,
+                kBatchSize,
+                1,
+                kps::AddFunctor<AccT>,
+                kps::details::ReduceMode::kLocalMode>(
+        &sum[0], &sum_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
+  }
  WarpReduceSum<AccT, kBatchSize, kWarpSize>(sum);

// write result to global memory
@@ -453,10 +472,23 @@
    if (i >= local_batches) break;
    AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[i][0][0]);
    AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[i][0][0]);
-   kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
-       &out[i][0][0], &gradptr[0], UnarySubFunctor<AccT>(sum[i]));
-   kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::MulFunctor<AccT>>(
-       &out_tmp[i][0][0], &srcptr[0], &out[i][0][0], kps::MulFunctor<AccT>());
+   if (LogMode) {
+     kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpMulFunctor<AccT>>(
+         &out[i][0][0], &srcptr[0], ExpMulFunctor<AccT>(sum[i]));
+     kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::SubFunctor<AccT>>(
+         &out_tmp[i][0][0],
+         &gradptr[0],
+         &out[i][0][0],
+         kps::SubFunctor<AccT>());
+   } else {
+     kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+         &out[i][0][0], &gradptr[0], UnarySubFunctor<AccT>(sum[i]));
+     kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::MulFunctor<AccT>>(
+         &out_tmp[i][0][0],
+         &srcptr[0],
+         &out[i][0][0],
+         kps::MulFunctor<AccT>());
+   }
    VecT* dst_v = reinterpret_cast<VecT*>(&dst[(first_batch + i) * stride]);
    VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
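
Note: writing y for the forward output and g for the incoming gradient, the two branches implement the standard closed forms below. This is why the LogMode reduction above only needs the plain sum of g, while the softmax path reduces the elementwise product g*y:

\[
\text{log-softmax:}\quad
  \frac{\partial L}{\partial x_i} = g_i - e^{y_i}\sum_j g_j,
\qquad
\text{softmax:}\quad
  \frac{\partial L}{\partial x_i} = y_i\Bigl(g_i - \sum_j g_j\,y_j\Bigr).
\]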
@@ -639,7 +671,8 @@ __global__ void NormalSoftmaxForward(

template <typename T,
          typename AccT,
-         template <typename, typename> class Functor>
+         template <typename, typename> class Functor,
+         bool LogMode>
__global__ void NormalSoftmaxBackward(T* input_grad,
const T* output_grad,
const T* output,
@@ -656,10 +689,17 @@ __global__ void NormalSoftmaxBackward(T* input_grad,

  // 1. reduce sum
  AccT sum = 0;
- for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) {
-   int data_offset = grad_offset + mid_id * mid_stride;
-   sum += static_cast<AccT>(output_grad[data_offset]) *
-          static_cast<AccT>(output[data_offset]);
- }
+ if (LogMode) {
+   for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) {
+     int data_offset = grad_offset + mid_id * mid_stride;
+     sum += static_cast<AccT>(output_grad[data_offset]);
+   }
+ } else {
+   for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) {
+     int data_offset = grad_offset + mid_id * mid_stride;
+     sum += static_cast<AccT>(output_grad[data_offset]) *
+            static_cast<AccT>(output[data_offset]);
+   }
+ }
if (blockDim.y > 1) {
kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
@@ -715,21 +755,21 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx,
  dim3 grid, block;
  GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block);
  if (LogMode) {
-   NormalSoftmaxBackward<
-       T,
-       AccT,
-       LogSoftmaxBackwardFunctor><<<grid, block, 0, dev_ctx.stream()>>>(
+   NormalSoftmaxBackward<T,
+                         AccT,
+                         LogSoftmaxBackwardFunctor,
+                         LogMode><<<grid, block, 0, dev_ctx.stream()>>>(
        input_grad_data,
        output_grad_data,
        output_data,
        high_dim,
        mid_dim,
        low_dim);
  } else {
-   NormalSoftmaxBackward<
-       T,
-       AccT,
-       SoftmaxBackwardFunctor><<<grid, block, 0, dev_ctx.stream()>>>(
+   NormalSoftmaxBackward<T,
+                         AccT,
+                         SoftmaxBackwardFunctor,
+                         LogMode><<<grid, block, 0, dev_ctx.stream()>>>(
        input_grad_data,
        output_grad_data,
        output_data,
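Note: LogMode is threaded through these kernels as a template parameter rather than a runtime flag, so every if (LogMode) is a compile-time constant and the dead branch is eliminated in each instantiation. A minimal illustrative kernel (not Paddle code) showing the pattern:

// Illustrative only: templating on a bool makes the branch a compile-time
// constant, so each instantiation contains just one code path.
template <bool LogMode>
__global__ void ScaleOrLog(float* out, const float* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  if (LogMode) {  // constant-folded per instantiation
    out[i] = logf(in[i]);
  } else {
    out[i] = in[i] * 0.5f;
  }
}

// Host-side dispatch picks the instantiation once:
//   ScaleOrLog<true><<<grid, block>>>(out, in, n);   // log path
//   ScaleOrLog<false><<<grid, block>>>(out, in, n);  // plain path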
6 changes: 4 additions & 2 deletions python/paddle/nn/functional/activation.py
@@ -1359,11 +1359,12 @@ def log_softmax(x, axis=-1, dtype=None, name=None):

    if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)):
        dtype = convert_np_dtype_to_dtype_(dtype)
+   use_cudnn = True

    if in_dynamic_mode():
        if dtype is not None:
            x = _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
-       return _C_ops.log_softmax(x, 'axis', axis)
+       return _C_ops.log_softmax(x, 'axis', axis, 'use_cudnn', use_cudnn)

if dtype is None:
check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
@@ -1388,7 +1389,8 @@ def log_softmax(x, axis=-1, dtype=None, name=None):
        type='log_softmax',
        inputs={'X': out_cast},
        outputs={'Out': out},
-       attrs={'axis': axis})
+       attrs={'axis': axis,
+              'use_cudnn': use_cudnn})

return out
