
Commit 33bb053
Merge branch 'develop' of github.com:YuanRisheng/Paddle into add_raw_kernel
YuanRisheng committed Jan 25, 2022
2 parents 365f60a + 529f142 commit 33bb053
Showing 90 changed files with 3,072 additions and 1,851 deletions.
5 changes: 3 additions & 2 deletions cmake/inference_lib.cmake
@@ -189,6 +189,7 @@ copy(inference_lib_dist
copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})

set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")

if(WIN32)
if(WITH_STATIC_LIB)
set(paddle_inference_lib $<TARGET_FILE_DIR:paddle_inference>/libpaddle_inference.lib
@@ -304,7 +305,7 @@ copy(fluid_lib_dist
)

set(module "platform")
set(platform_lib_deps profiler_proto error_codes_proto)
set(platform_lib_deps profiler_proto errors)
if(WITH_GPU)
set(platform_lib_deps ${platform_lib_deps} external_error_proto)
endif(WITH_GPU)
@@ -317,7 +318,7 @@ copy(fluid_lib_dist

set(module "string")
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/*.h ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)

1 change: 1 addition & 0 deletions paddle/CMakeLists.txt
@@ -1,3 +1,4 @@
add_subdirectory(utils)
add_subdirectory(scripts)
add_subdirectory(testing)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
1 change: 0 additions & 1 deletion paddle/fluid/CMakeLists.txt
@@ -4,7 +4,6 @@ add_subdirectory(distributed)
add_subdirectory(framework)
add_subdirectory(imperative)
add_subdirectory(operators)
add_subdirectory(string)
add_subdirectory(pybind)
add_subdirectory(eager)
# NOTE: please add subdirectory inference at last.
12 changes: 8 additions & 4 deletions paddle/fluid/framework/expect.h
@@ -19,14 +19,18 @@
#define _LINUX
#endif

#ifdef _LINUX
#ifndef likely
#define likely(x) __builtin_expect((x), 1)
#ifdef _LINUX
#define likely(expr) (__builtin_expect(!!(expr), 1))
#else
#define likely(expr) (expr)
#endif
#endif

#ifdef _LINUX
#ifndef unlikely
#define unlikely(x) __builtin_expect((x), 0)
#ifdef _LINUX
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#else
#define unlikely(expr) (expr)
#endif
#endif
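
The rewrite above makes the branch hints available on every platform: on Linux the macros expand to __builtin_expect, elsewhere they fall back to the bare expression, and the new !!(expr) normalizes any truthy operand to exactly 1 so the comparison inside __builtin_expect is correct for pointers and non-boolean integers. A minimal standalone sketch of the same pattern, keyed on compiler support rather than _LINUX purely for illustration:

#include <cstdio>

#if defined(__GNUC__) || defined(__clang__)
#define likely(expr) (__builtin_expect(!!(expr), 1))
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#else
#define likely(expr) (expr)
#define unlikely(expr) (expr)
#endif

// parse() is a hypothetical caller, not part of this commit.
int parse(const char* s) {
  if (unlikely(s == nullptr)) {  // hint: the error path is rare
    std::fprintf(stderr, "null input\n");
    return -1;
  }
  return 0;  // hot path stays on the fall-through branch
}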
8 changes: 4 additions & 4 deletions paddle/fluid/framework/ir/graph_helper.cc
@@ -603,9 +603,9 @@ static std::vector<std::vector<ir::Node::Dep>> GetOpDependencies(
for (const auto *op_desc : block_ops) {
size_t op_idx = op_id_to_idx.size();
PADDLE_ENFORCE_EQ(
op_id_to_idx.emplace(op_desc->Id(), op_idx).second, true,
op_id_to_idx.emplace(op_desc->OriginalId(), op_idx).second, true,
platform::errors::InvalidArgument(
"There should not be duplicate op id: %d", op_desc->Id()));
"There should not be duplicate op id: %d", op_desc->OriginalId()));
}

std::vector<std::vector<ir::Node::Dep>> dep_matrix(op_num);
@@ -624,9 +624,9 @@ static std::vector<std::vector<ir::Node::Dep>> GetOpDependencies(

for (const auto &pair : all_preceding_ops) {
const auto *cur_op_node = pair.first;
size_t op_idx_1 = get_op_idx_by_id(cur_op_node->Op()->Id());
size_t op_idx_1 = get_op_idx_by_id(cur_op_node->Op()->OriginalId());
for (const auto *preceding_op_node : pair.second) {
size_t op_idx_2 = get_op_idx_by_id(preceding_op_node->Op()->Id());
size_t op_idx_2 = get_op_idx_by_id(preceding_op_node->Op()->OriginalId());
dep_matrix[op_idx_1][op_idx_2] = ir::Node::Dep::kAfter;
dep_matrix[op_idx_2][op_idx_1] = ir::Node::Dep::kBefore;
}
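
The change above keys the op-index map by OriginalId() instead of Id(); the PADDLE_ENFORCE_EQ works because map::emplace returns an {iterator, bool} pair whose bool is false when the key is already present, so insertion and duplicate detection happen in one step. A minimal sketch of that emplace idiom with simplified stand-in types:

#include <cassert>
#include <cstddef>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<long> original_ids = {101, 205, 333};  // hypothetical op ids
  std::unordered_map<long, size_t> op_id_to_idx;
  for (long id : original_ids) {
    size_t op_idx = op_id_to_idx.size();  // next dense index
    bool inserted = op_id_to_idx.emplace(id, op_idx).second;
    assert(inserted && "There should not be duplicate op id");
  }
  assert(op_id_to_idx.at(205) == 1);
  return 0;
}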
6 changes: 0 additions & 6 deletions paddle/fluid/framework/op_call_stack.h
@@ -19,12 +19,6 @@ limitations under the License. */
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace platform {
struct EnforceNotMet;
} // namespace platform
} // namespace paddle

namespace paddle {
namespace framework {

6 changes: 0 additions & 6 deletions paddle/fluid/framework/op_proto_maker_test.cc
@@ -18,12 +18,6 @@ limitations under the License. */
#include "gtest/gtest-test-part.h"
#include "gtest/gtest.h"

namespace paddle {
namespace platform {
struct EnforceNotMet;
} // namespace platform
} // namespace paddle

class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
public:
void Make() {
10 changes: 1 addition & 9 deletions paddle/fluid/framework/type_defs.h
@@ -21,8 +21,8 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/platform/variant.h"
#include "paddle/pten/core/type_defs.h"
#include "paddle/utils/small_vector.h"

namespace paddle {
@@ -39,14 +39,6 @@ class InferNoNeedBufferVarsFN;
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
using VariableValueMap = std::map<std::string, std::vector<Variable*>>;

// The order should be as same as framework.proto
using Attribute = boost::variant<
boost::blank, int, float, std::string, std::vector<int>, std::vector<float>,
std::vector<std::string>, bool, std::vector<bool>, BlockDesc*, int64_t,
std::vector<BlockDesc*>, std::vector<int64_t>, std::vector<double>>;

using AttributeMap = std::unordered_map<std::string, Attribute>;

#ifdef PADDLE_WITH_ASCEND_CL
using NPUAttribute =
boost::variant<boost::blank, int, float, std::string, std::vector<int>,
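
With this hunk the Attribute variant and AttributeMap are removed from fluid's type_defs.h, which now includes paddle/pten/core/type_defs.h; where the definitions land is not shown here. For readers unfamiliar with the structure being moved, a sketch of the same idea using std::variant as a stand-in for boost::variant, with the alternative list abbreviated:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

// Abbreviated stand-in; the real list follows the order in framework.proto.
using Attribute =
    std::variant<int, float, std::string, std::vector<int>, bool, int64_t>;
using AttributeMap = std::unordered_map<std::string, Attribute>;

int main() {
  AttributeMap attrs;
  attrs["axis"] = 1;            // holds the int alternative
  attrs["use_mkldnn"] = false;  // holds the bool alternative
  std::cout << std::get<int>(attrs["axis"]) << "\n";  // prints 1
  return 0;
}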
4 changes: 4 additions & 0 deletions paddle/fluid/imperative/prepared_operator.cc
@@ -369,6 +369,10 @@ static void BuildDygraphPtenKernelContext(
size_t end_idx = start_idx + outs_vector.size();

for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
if (outs_vector[offset] == nullptr) {
kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr});
continue;
}
auto* var = outs_vector[offset]->MutableVar();
framework::Tensor* tensor_out = nullptr;
if (var->template IsType<framework::LoDTensor>()) {
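
The added guard handles optional outputs that were not provided: rather than dereferencing a null entry, it pushes a {nullptr} placeholder so every output slot still occupies a position and the (start, end) index ranges recorded for the kernel keep lining up. A sketch of that slot-preserving idea; every name below is an invented stand-in, not the real KernelContext API:

#include <iostream>
#include <vector>

struct Tensor { float value = 0.f; };

struct KernelContext {
  std::vector<Tensor*> outputs;  // nullptr marks a skipped optional output
  void EmplaceBackOutput(Tensor* t) { outputs.push_back(t); }
};

int main() {
  Tensor out0, out2;
  std::vector<Tensor*> op_outputs = {&out0, nullptr, &out2};  // slot 1 absent

  KernelContext ctx;
  for (Tensor* t : op_outputs) {
    if (t == nullptr) {  // keep the slot so later offsets stay valid
      ctx.EmplaceBackOutput(nullptr);
      continue;
    }
    ctx.EmplaceBackOutput(t);
  }
  std::cout << (ctx.outputs[2] == &out2) << "\n";  // prints 1: indices align
  return 0;
}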
45 changes: 1 addition & 44 deletions paddle/fluid/imperative/type_defs.h
@@ -13,47 +13,4 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <map>
#include <memory>
#include <string>
#include <vector>

namespace paddle {
namespace imperative {

class VariableWrapper;
class SavedVariableWrapperList;
class VarBase;
class OpBase;
class GradOpNode;
class Tracer;

using WeakNameVarBaseMap =
std::map<std::string, std::vector<std::weak_ptr<VarBase>>>;

namespace details {
template <typename T>
struct NameVarMapTrait {};

template <>
struct NameVarMapTrait<VarBase> {
using Type = std::map<std::string, std::vector<std::shared_ptr<VarBase>>>;
};

template <>
struct NameVarMapTrait<VariableWrapper> {
using Type = std::map<std::string, SavedVariableWrapperList>;
};
} // namespace details

template <typename T>
using NameVarMap = typename details::NameVarMapTrait<T>::Type;

using NameVarBaseMap = NameVarMap<VarBase>;
using NameVariableWrapperMap = NameVarMap<VariableWrapper>;

using VariableWrapperList = std::vector<std::shared_ptr<VariableWrapper>>;

} // namespace imperative
} // namespace paddle
#include "paddle/pten/core/type_defs.h"
5 changes: 3 additions & 2 deletions paddle/fluid/inference/CMakeLists.txt
@@ -36,6 +36,7 @@ endif()
# fluid_modules exclude API-interface of inference/api and inference/capi_exp
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES)
set(utils_modules stringpiece pretty_log string_helper)

add_subdirectory(api)

@@ -46,9 +47,9 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
#TODO(wilber, T8T9): Do we still need to support windows gpu static library?
if(WIN32 AND WITH_GPU)
cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules})
else()
create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules})
if(WITH_IPU)
target_link_libraries(paddle_inference -Wl,--allow-multiple-definition popart_canonicalization_utils)
endif()
28 changes: 0 additions & 28 deletions paddle/fluid/operators/elementwise/elementwise_add_op.cc
@@ -33,34 +33,6 @@ class CPUDeviceContext;
namespace paddle {
namespace operators {

template <typename T>
struct SameDimsElemwiseAdd<
platform::CPUDeviceContext, T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
blas.VADD(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
}
};

template <typename T>
struct SameDimsElemwiseAdd<
platform::CPUDeviceContext, T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto eigen_x = framework::EigenVector<T>::Flatten(*x);
auto eigen_y = framework::EigenVector<T>::Flatten(*y);
auto eigen_z = framework::EigenVector<T>::Flatten(*z);
auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
eigen_z.device(place) = eigen_x + eigen_y;
}
};

class ElementwiseAddOpMaker : public ElementwiseOpMaker {
protected:
std::string GetName() const override { return "Add"; }
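
The two functors deleted above selected their implementation with SFINAE: the std::enable_if second template parameter enables the BLAS VADD specialization only for floating-point T, and the Eigen specialization for everything else. A compact sketch of that dispatch pattern, with the kernel bodies reduced to prints:

#include <iostream>
#include <type_traits>

template <typename T, typename Enable = void>
struct SameDimsAdd;

template <typename T>
struct SameDimsAdd<
    T, typename std::enable_if<std::is_floating_point<T>::value>::type> {
  void operator()() const { std::cout << "BLAS VADD path\n"; }
};

template <typename T>
struct SameDimsAdd<
    T, typename std::enable_if<!std::is_floating_point<T>::value>::type> {
  void operator()() const { std::cout << "Eigen elementwise path\n"; }
};

int main() {
  SameDimsAdd<float>()();  // floating point: BLAS VADD path
  SameDimsAdd<int>()();    // everything else: Eigen elementwise path
  return 0;
}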
130 changes: 2 additions & 128 deletions paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -13,139 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/pten/kernels/gpu/elementwise.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

namespace paddle {
namespace operators {

template <typename T>
static __global__ void SimpleElemwiseAddGradCUDAKernel(
const T* __restrict__ dout, int size, int vec_size, T* dx, T* dy) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = gridDim.x * blockDim.x;
int loop = size / vec_size;
int remainder = size % vec_size;
const float4* dout_vec = reinterpret_cast<const float4*>(dout);
float4* dx_vec = reinterpret_cast<float4*>(dx);
float4* dy_vec = reinterpret_cast<float4*>(dy);
float4 tmp_loop;

for (int i = tid; i < loop; i += stride) {
tmp_loop = dout_vec[i];
dx_vec[i] = tmp_loop;
dy_vec[i] = tmp_loop;
}

if (tid == loop && remainder != 0) {
T tmp_rem;
while (remainder) {
int idx = size - remainder;
remainder--;
tmp_rem = dout[idx];
dx[idx] = tmp_rem;
dy[idx] = tmp_rem;
}
}
}

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
default_elementwise_add_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x,
const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout,
framework::Tensor* dx, framework::Tensor* dy) {
int axis = ctx.Attr<int>("axis");
auto* dout_data = dout->data<T>();

// dx
if (dx != nullptr) {
auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout->dims()) {
if (dx_data != dout_data) {
framework::TensorCopy(
*dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dx);
}
} else {
// For inplace strategy, dx will be stored in addr of dout, which makes
// the result of dy wrong.
if (dx->IsSharedBufferWith(*dout)) {
dx->clear();
dx->mutable_data<T>(x->dims(), ctx.GetPlace());
}
std::vector<int> reduce_dims = GetReduceDim(x->dims(), out->dims(), axis);
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
if (dy != nullptr) {
auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout->dims()) {
if (dy_data != dout_data) {
framework::TensorCopy(
*dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dy);
}
} else {
std::vector<int> reduce_dims = GetReduceDim(y->dims(), out->dims(), axis);
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
}

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
elementwise_add_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
auto* dout_data = dout->data<T>();
if (dx_data == dout_data && dy_data != dout_data) {
VLOG(4) << "Special case when dx_data is the same as dout_data, "
"only need copy dout to dy";
framework::TensorCopy(
*dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dy);
} else if (dx_data != dout_data && dy_data == dout_data) {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"only need copy dout to dx";
framework::TensorCopy(
*dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dx);
} else if (dx_data != dout_data && dy_data != dout_data) {
auto size = x->numel();
int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
PREDEFINED_BLOCK_SIZE,
1);
SimpleElemwiseAddGradCUDAKernel<
T><<<grid_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
dout->data<T>(), size, vec_size, dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
} else {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"and dx_data is the same as dout_data, do not need "
"any operator";
}
}

} // namespace operators
namespace operators {} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,