Merge pull request #22 from songyuwen0808/paddlebox

conv相关代码合入
jack603047588 · Dec 30, 2021 · eecb4d4 · eecb4d4
2 parents 6001463 + b97d39b
commit eecb4d4
Show file tree

Hide file tree

Showing 8 changed files with 825 additions and 19 deletions.
diff --git a/cmake/external/box_ps.cmake b/cmake/external/box_ps.cmake
@@ -20,7 +20,7 @@ IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
   SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
   SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
   #SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps.tar.gz" CACHE STRING "" FORCE)
-  SET(BOX_PS_URL "data-im.baidu.com:/home/work/var/CI_DATA/im/static/box_ps.tar.gz/box_ps.tar.gz.20" CACHE STRING "" FORCE)
+  SET(BOX_PS_URL "data-im.baidu.com:/home/work/var/CI_DATA/im/static/box_ps.tar.gz/box_ps.tar.gz.30" CACHE STRING "" FORCE)
 ENDIF()
 MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
 SET(BOX_PS_SOURCE_DIR    "${THIRD_PARTY_PATH}/box_ps")

diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc
@@ -424,6 +424,9 @@ void BoxWrapper::PullSparse(const paddle::platform::Place& place,
                feature_type_ == static_cast<int>(boxps::FEATURE_SHOWCLK)) {  \
       PullSparseCase<boxps::FeaturePullValueGpuQuant<EmbedxDim, ExpandDim>>( \
           place, keys, values, slot_lengths, hidden_size, expand_embed_dim); \
+    } else if (feature_type_ == static_cast<int>(boxps::FEATURE_CONV)) {  \
+      PullSparseCase<boxps::FeaturePullValueGpuConv<EmbedxDim, ExpandDim>>( \
+          place, keys, values, slot_lengths, hidden_size, expand_embed_dim); \
     } else if (feature_type_ == static_cast<int>(boxps::FEATURE_VARIABLE)) { \
       PullSparseCase<boxps::FeatureVarPullValueGpu<EmbedxDim, ExpandDim>>(   \
           place, keys, values, slot_lengths, hidden_size, expand_embed_dim); \
@@ -475,28 +478,33 @@ void BoxWrapper::PushSparseGrad(const paddle::platform::Place& place,
     }                                                                        \
   } break
 
-#define PUSHSPARSE_CASE(i, ...)                                                \
-  case i: {                                                                    \
-    constexpr size_t ExpandDim = i;                                            \
-    if (feature_type_ == static_cast<int>(boxps::FEATURE_SHARE_EMBEDDING)) {   \
-      PushSparseGradCase<                                                      \
-          boxps::FeaturePushValueGpuShareEmbedding<EmbedxDim, ExpandDim>>(     \
-          place, keys, grad_values, slot_lengths, hidden_size,                 \
-          expand_embed_dim, batch_size);                                       \
-    } else if (feature_type_ == static_cast<int>(boxps::FEATURE_PCOC)) {       \
-      PushSparseGradCase<                                                      \
-          boxps::FeaturePushValueGpuPCOC<EmbedxDim, ExpandDim>>(               \
-          place, keys, grad_values, slot_lengths, hidden_size,                 \
-          expand_embed_dim, batch_size);                                       \
+#define PUSHSPARSE_CASE(i, ...)                                              \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    if (feature_type_ == static_cast<int>(boxps::FEATURE_SHARE_EMBEDDING)) { \
+      PushSparseGradCase<                                                    \
+          boxps::FeaturePushValueGpuShareEmbedding<EmbedxDim, ExpandDim>>(   \
+          place, keys, grad_values, slot_lengths, hidden_size,               \
+          expand_embed_dim, batch_size);                                     \
+    } else if (feature_type_ == static_cast<int>(boxps::FEATURE_PCOC)) {     \
+      PushSparseGradCase<                                                    \
+          boxps::FeaturePushValueGpuPCOC<EmbedxDim, ExpandDim>>(             \
+          place, keys, grad_values, slot_lengths, hidden_size,               \
+          expand_embed_dim, batch_size);                                     \
     } else if (feature_type_ == static_cast<int>(boxps::FEATURE_VARIABLE)) {   \
       PushSparseGradCase<boxps::FeatureVarPushValueGpu<EmbedxDim, ExpandDim>>( \
           place, keys, grad_values, slot_lengths, hidden_size,                 \
           expand_embed_dim, batch_size);                                       \
-    } else {                                                                   \
-      PushSparseGradCase<boxps::FeaturePushValueGpu<EmbedxDim, ExpandDim>>(    \
-          place, keys, grad_values, slot_lengths, hidden_size,                 \
-          expand_embed_dim, batch_size);                                       \
-    }                                                                          \
+    } else if (feature_type_ == static_cast<int>(boxps::FEATURE_CONV)) {     \
+      PushSparseGradCase<                                                    \
+          boxps::FeaturePushValueGpuConv<EmbedxDim, ExpandDim>>(             \
+          place, keys, grad_values, slot_lengths, hidden_size,               \
+          expand_embed_dim, batch_size);                                     \
+    } else {                                                                 \
+      PushSparseGradCase<boxps::FeaturePushValueGpu<EmbedxDim, ExpandDim>>(  \
+          place, keys, grad_values, slot_lengths, hidden_size,               \
+          expand_embed_dim, batch_size);                                     \
+    }                                                                        \
   } break
 
   CheckEmbedSizeIsValid(hidden_size - cvm_offset_, expand_embed_dim);

diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu
@@ -1189,6 +1189,11 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
           stream, gpu_keys, gpu_values, total_values_gpu, hidden_size,        \
           EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \
           pull_embedx_scale_, cvm_offset_, gpu_restore_idx);                  \
+    } else if (feature_type_ == static_cast<int>(boxps::FEATURE_CONV)) {   \
+      FeaturePullCopy<boxps::FeaturePullValueGpuConv<EmbedxDim, ExpandDim>>( \
+          stream, gpu_keys, gpu_values, total_values_gpu, hidden_size,        \
+          EmbedxDim, total_length, total_dims, slot_lens, slot_num, key2slot, \
+          pull_embedx_scale_, cvm_offset_, gpu_restore_idx);                  \
     } else {                                                                  \
       FeaturePullCopy<boxps::FeaturePullValueGpu<EmbedxDim, ExpandDim>>(      \
           stream, gpu_keys, gpu_values, total_values_gpu, hidden_size,        \
@@ -1219,6 +1224,12 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
           stream, gpu_keys, gpu_values, total_values_gpu, hidden_size,         \
           EmbedxDim, ExpandDim, total_length, total_dims, slot_lens, slot_num, \
           key2slot, 1.0, cvm_offset_, gpu_restore_idx);                        \
+    } else if (feature_type_ == static_cast<int>(boxps::FEATURE_CONV)) {      \
+      FeaturePullCopyNNCross<                                                 \
+          boxps::FeaturePullValueGpuConv<EmbedxDim, ExpandDim>>(                \
+          stream, gpu_keys, gpu_values, total_values_gpu, hidden_size,         \
+          EmbedxDim, ExpandDim, total_length, total_dims, slot_lens, slot_num, \
+          key2slot, 1.0, cvm_offset_, gpu_restore_idx);                        \
     } else {                                                                   \
       FeaturePullCopyNNCross<                                                  \
           boxps::FeaturePullValueGpu<EmbedxDim, ExpandDim>>(                   \
@@ -1479,6 +1490,12 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
           total_length, batch_size, d_slot_vector, total_dims, slot_lens,     \
           slot_num, key2slot, cvm_offset_, gpu_sort_idx, gpu_sort_offset,     \
           gpu_sort_lens);                                                     \
+    } else if (feature_type_ == static_cast<int>(boxps::FEATURE_CONV)) {      \
+      FeaturePushCopy<boxps::FeaturePushValueGpuConv<EmbedxDim, ExpandDim>>(  \
+          stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \
+          total_length, batch_size, d_slot_vector, total_dims, slot_lens,     \
+          slot_num, key2slot, cvm_offset_, gpu_sort_idx, gpu_sort_offset,     \
+          gpu_sort_lens);                                                     \
     } else {                                                                  \
       FeaturePushCopy<boxps::FeaturePushValueGpu<EmbedxDim, ExpandDim>>(      \
           stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \
@@ -1505,6 +1522,13 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
           ExpandDim, total_length, batch_size, d_slot_vector, total_dims,     \
           slot_lens, slot_num, key2slot, cvm_offset_, gpu_sort_idx,           \
           gpu_sort_offset, gpu_sort_lens);                                    \
+    } else if (feature_type_ == static_cast<int>(boxps::FEATURE_CONV)) {  \
+      FeaturePushCopyVariable<                                                \
+          boxps::FeaturePushValueGpuConv<EmbedxDim, ExpandDim>>(               \
+          stream, total_grad_values_gpu, grad_values, hidden_size, EmbedxDim, \
+          ExpandDim, total_length, batch_size, d_slot_vector, total_dims,     \
+          slot_lens, slot_num, key2slot, cvm_offset_, gpu_sort_idx,           \
+          gpu_sort_offset, gpu_sort_lens);                                    \
     } else {                                                                  \
       FeaturePushCopyNNCross<                                                 \
           boxps::FeaturePushValueGpu<EmbedxDim, ExpandDim>>(                  \

diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h
@@ -579,6 +579,8 @@ class BoxWrapper {
       } else if (s_instance_->feature_type_ ==
                  static_cast<int>(boxps::FEATURE_PCOC)) {
         s_instance_->cvm_offset_ = 8;
+      } else if (s_instance_->feature_type_ == static_cast<int>(boxps::FEATURE_CONV)) {
+        s_instance_->cvm_offset_ = 4;
       } else {
         s_instance_->cvm_offset_ = 3;
       }

diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cc
@@ -0,0 +1,220 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.h"
+#include <string>
+namespace paddle {
+namespace operators {
+
+// Forward-op definition for fused_seqpool_cvm_with_conv. InferShape validates
+// the "X" / "CVM" inputs and derives one 2-D output shape per "X" entry.
+class FusedSeqpoolCVMOpWithConv : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Validates the inputs and sets Out[i] = [-1, width_i], where width_i is:
+  //   use_cvm && show_filter   -> last dim of X[i] minus 1
+  //   use_cvm && !show_filter  -> last dim of X[i]
+  //   !use_cvm                 -> last dim of X[i] minus cvm_offset
+  // Raises an enforce error when X or Out is empty, when CVM is not a rank-2
+  // tensor whose 2nd dimension is 3, when the first X entry is not rank 2, or
+  // (with use_cvm) when the last dimension of any X entry is not above 2.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_GE(
+        ctx->Inputs("X").size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Inputs(X) of FusedSeqpoolCVMOpWithConv should not be empty."));
+    PADDLE_ENFORCE_GE(
+        ctx->Outputs("Out").size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Outputs(Out) of FusedSeqpoolCVMOpWithConv should not be empty."));
+
+    // Dim values are signed, so compare against signed literals (this matches
+    // the rank check done by the grad op).
+    auto cvm_dims = ctx->GetInputDim("CVM");
+    PADDLE_ENFORCE_EQ(
+        cvm_dims.size(), 2,
+        platform::errors::InvalidArgument("Input(CVM)'s rank should be 2."));
+    PADDLE_ENFORCE_EQ(cvm_dims[1], 3,
+                      platform::errors::InvalidArgument(
+                          "The 2nd dimension of Input(CVM) should be 3."));
+
+    auto ins_dims = ctx->GetInputsDim("X");
+    const int cvm_offset = ctx->Attrs().Get<int>("cvm_offset");
+    const size_t num_inputs = ins_dims.size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.resize(num_inputs);
+    bool use_cvm = ctx->Attrs().Get<bool>("use_cvm");
+    bool show_filter = ctx->Attrs().Get<bool>("show_filter");
+
+    PADDLE_ENFORCE_GT(num_inputs, 0UL,
+                      platform::errors::InvalidArgument(
+                          "Input tensors count should be greater than 0, "
+                          "but received value is %d.",
+                          num_inputs));
+
+    // The output height should be confirmed in Compute,
+    // since input lod is not accessible here.
+    // Only the rank of the first input is verified at this point.
+    PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
+                      platform::errors::InvalidArgument(
+                          "The dims size of first input should be equal to 2, "
+                          "but received value is %d.",
+                          ins_dims[0].size()));
+
+    for (size_t i = 0; i < num_inputs; ++i) {
+      const auto dims = ins_dims[i];
+      int rank = dims.size();
+      if (use_cvm) {
+        PADDLE_ENFORCE_GT(
+            dims[rank - 1], 2,
+            platform::errors::InvalidArgument(
+                "Shape error in %lu id, the last dimension(embedding) of the "
+                "'X' tensor must be larger than 2.",
+                i));
+      }
+      // input lod is not accessible here, so the first output dim stays -1
+      std::vector<int64_t> out_dim;
+      if (use_cvm) {
+        if (show_filter) {
+          out_dim = {-1, dims[rank - 1] - 1};
+        } else {
+          out_dim = {-1, dims[rank - 1]};
+        }
+      } else {
+        out_dim = {-1, dims[rank - 1] - cvm_offset};
+      }
+      outs_dims[i] = framework::make_ddim(out_dim);
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+  }
+
+ protected:
+  // The kernel type is fixed to FP32 on the device of the execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   ctx.device_context());
+  }
+};
+
+// Declares the inputs, outputs and attributes of fused_seqpool_cvm_with_conv.
+class FusedSeqpoolCVMOpWithConvMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  // Registers the op proto: duplicable input "X", input "CVM", duplicable
+  // output "Out", and the pooltype / pad_value / use_cvm / cvm_offset /
+  // show_filter attributes with their defaults.
+  void Make() override {
+    AddInput("X",
+             "(vector<LoDTensor>) The input tensors of"
+             " operator.")
+        .AsDuplicable();
+    // The operator's InferShape enforces that the 2nd dimension of CVM is 3,
+    // so the description must say [N x 3], not [N x 2].
+    // NOTE(review): the meaning of the three columns is not visible here
+    // (presumably show, click and a conv statistic) - confirm with the kernel.
+    AddInput("CVM",
+             "(Tensor),  a 2-D Tensor with shape [N x 3], where N is the batch "
+             "size.");
+    AddOutput("Out",
+              "(vector<Tensor>) The output of Op does not contain LoD "
+              "information.")
+        .AsDuplicable();
+    AddAttr<std::string>("pooltype",
+                         "(string, default 'SUM') the pooling pooltype of "
+                         "SequencePoolOp, only support SUM now.")
+        .SetDefault("SUM")
+        .InEnum({"SUM"});
+    AddAttr<float>("pad_value",
+                   "(float, default 0.0) The value to pad for empty sequence.")
+        .SetDefault(0.0);
+    AddAttr<bool>("use_cvm", "bool, use cvm or not").SetDefault(true);
+    AddAttr<int>("cvm_offset", "(int, default 3)").SetDefault(3);
+    AddAttr<bool>("show_filter", "(bool, default false)").SetDefault(false);
+
+    AddComment(R"DOC(
+Fuse multiple pairs of Sequence Pool and CVM Operator.
+
+)DOC");
+  }
+};
+
+// Backward-op definition for fused_seqpool_cvm_with_conv_grad. InferShape
+// checks Out@GRAD against X and gives X@GRAD the LoD and dims of X.
+class FusedSeqpoolCVMGradOpWithConv : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Validates that CVM is rank 2 and that each Out@GRAD[i] has the same rank
+  // as X[i] and the last dimension the forward op would have produced
+  // (X's last dim, minus 1 with show_filter, or minus cvm_offset without
+  // use_cvm). Then shares LoD and dims from every X[i] to X@GRAD[i].
+  // Raises an enforce error on any mismatch.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto og_dims = ctx->GetInputsDim(framework::GradVarName("Out"));
+    auto x_dims = ctx->GetInputsDim("X");
+    auto cvm_dims = ctx->GetInputDim("CVM");
+    const int cvm_offset = ctx->Attrs().Get<int>("cvm_offset");
+    bool use_cvm = ctx->Attrs().Get<bool>("use_cvm");
+    bool show_filter = ctx->Attrs().Get<bool>("show_filter");
+
+    PADDLE_ENFORCE_EQ(
+        cvm_dims.size(), 2,
+        platform::errors::InvalidArgument("Input(CVM)'s rank should be 2."));
+
+    // The loop below indexes x_dims with the Out@GRAD index, so both lists
+    // must have the same length; otherwise x_dims[i] reads out of bounds.
+    PADDLE_ENFORCE_EQ(
+        og_dims.size(), x_dims.size(),
+        platform::errors::InvalidArgument(
+            "The number of Input(OUT@GRAD) tensors must be equal to the "
+            "number of Input(X) tensors, but received %d and %d.",
+            og_dims.size(), x_dims.size()));
+
+    for (size_t i = 0; i < og_dims.size(); i++) {
+      PADDLE_ENFORCE_EQ(
+          og_dims[i].size(), x_dims[i].size(),
+          platform::errors::InvalidArgument(
+              "The rank of output grad must equal to Input(X). But "
+              "received: input rank %u, input shape [%s].",
+              og_dims[i].size(), og_dims[i]));
+      // Ranks are equal here, so one index addresses the last dim of both.
+      const int last = og_dims[i].size() - 1;
+      if (use_cvm) {
+        auto o_dim = og_dims[i][last];
+        if (show_filter) {
+          // The forward op emits one column fewer when show_filter is set.
+          o_dim += 1;
+        }
+        PADDLE_ENFORCE_EQ(
+            o_dim, x_dims[i][last],
+            platform::errors::InvalidArgument(
+                "The dimension mismatch between Input(OUT@GRAD) and "
+                "Input(X). Received Input(OUT@GRAD): input rank %u, "
+                "input shape [%s]; received Input(X): input rank %u, "
+                "input shape [%s].",
+                og_dims[i].size(), og_dims[i], x_dims[i].size(), x_dims[i]));
+      } else {
+        PADDLE_ENFORCE_EQ(
+            og_dims[i][last], x_dims[i][last] - cvm_offset,
+            platform::errors::InvalidArgument(
+                "The dimension mismatch between Input(OUT@GRAD) and "
+                "Input(X). Received Input(OUT@GRAD): input rank %u, "
+                "input shape [%s]; received Input(X): input rank %u, "
+                "input shape [%s].",
+                og_dims[i].size(), og_dims[i], x_dims[i].size(), x_dims[i]));
+      }
+    }
+    for (size_t i = 0; i < x_dims.size(); ++i) {
+      ctx->ShareLoD("X", framework::GradVarName("X"), i, i);
+      ctx->ShareDim("X", framework::GradVarName("X"), i, i);
+    }
+  }
+
+ protected:
+  // The kernel data type follows the data type of Out@GRAD.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
+
+// Builds the description of the fused_seqpool_cvm_with_conv_grad op from the
+// forward op: it forwards X and CVM, consumes Out@GRAD, and produces X@GRAD
+// and CVM@GRAD with the forward op's attributes.
+template <typename T>
+class FusedSeqpoolCVMGradOpWithConvMaker
+    : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("fused_seqpool_cvm_with_conv_grad");
+
+    // Forward inputs are passed through under their original slot names.
+    for (const char* slot : {"X", "CVM"}) {
+      grad_op->SetInput(slot, this->Input(slot));
+    }
+
+    const auto out_grad_slot = framework::GradVarName("Out");
+    grad_op->SetInput(out_grad_slot, this->OutputGrad("Out"));
+
+    // NOTE(review): the `false` passed for X presumably keeps empty gradient
+    // slots in place - confirm against SingleGradOpMaker::InputGrad.
+    const auto x_grad_slot = framework::GradVarName("X");
+    grad_op->SetOutput(x_grad_slot, this->InputGrad("X", false));
+
+    const auto cvm_grad_slot = framework::GradVarName("CVM");
+    grad_op->SetOutput(cvm_grad_slot, this->InputGrad("CVM"));
+
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Forward op: operator class, proto maker, and the grad-op makers for the
+// OpDesc and imperative OpBase instantiations.
+REGISTER_OPERATOR(
+    fused_seqpool_cvm_with_conv, ops::FusedSeqpoolCVMOpWithConv,
+    ops::FusedSeqpoolCVMOpWithConvMaker,
+    ops::FusedSeqpoolCVMGradOpWithConvMaker<paddle::framework::OpDesc>,
+    ops::FusedSeqpoolCVMGradOpWithConvMaker<paddle::imperative::OpBase>);
+
+// Backward op: operator class only.
+REGISTER_OPERATOR(fused_seqpool_cvm_with_conv_grad,
+                  ops::FusedSeqpoolCVMGradOpWithConv);
+
+// CPU kernels, registered for float only.
+REGISTER_OP_CPU_KERNEL(fused_seqpool_cvm_with_conv,
+                       ops::FusedSeqpoolCVMOpWithConvCPUKernel<float>);
+REGISTER_OP_CPU_KERNEL(fused_seqpool_cvm_with_conv_grad,
+                       ops::FusedSeqpoolCVMGradOpWithConvCPUKernel<float>);