Commit

Merge branch 'develop' into hackathon_75
zhangjun committed Apr 12, 2023
2 parents fc42089 + 57201d9 commit c2a5357
Showing 431 changed files with 8,709 additions and 3,971 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -257,7 +257,7 @@ option(WITH_BOX_PS "Compile with box_ps support" OFF)
option(WITH_XBYAK "Compile with xbyak support" ON)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
option(WITH_HETERPS "Compile with heterps" OFF})
option(WITH_HETERPS "Compile with heterps" OFF)
option(WITH_INFERENCE_API_TEST
"Test fluid inference C++ high-level api interface" OFF)
option(WITH_INFERENCE_NVTX "Paddle inference with nvtx for profiler" OFF)
32 changes: 21 additions & 11 deletions cmake/cuda.cmake
@@ -7,28 +7,33 @@ if(WITH_NV_JETSON)
set(paddle_known_gpu_archs "53 62 72")
set(paddle_known_gpu_archs10 "53 62 72")
set(paddle_known_gpu_archs11 "53 62 72 87")
set(paddle_known_gpu_archs12 "53 62 72 87 90")
elseif(NEW_RELEASE_ALL)
message("Using New Release Strategy - All Arches Packge")
add_definitions(-DNEW_RELEASE_ALL)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90")
elseif(NEW_RELEASE_PYPI)
message("Using New Release Strategy - Cubin Packge")
add_definitions(-DNEW_RELEASE_PYPI)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
set(paddle_known_gpu_archs10 "")
set(paddle_known_gpu_archs11 "61 70 75 80")
set(paddle_known_gpu_archs12 "61 70 75 80 90")
elseif(NEW_RELEASE_JIT)
message("Using New Release Strategy - JIT Packge")
add_definitions(-DNEW_RELEASE_JIT)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs10 "35 50 60 70 75")
set(paddle_known_gpu_archs11 "35 50 60 70 75 80")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
set(paddle_known_gpu_archs10 "50 60 70 75")
set(paddle_known_gpu_archs11 "50 60 70 75 80")
set(paddle_known_gpu_archs12 "50 60 70 75 80 90")
else()
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 90")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
set(paddle_known_gpu_archs12 "52 60 61 70 75 80 90")
endif()

######################################################################################
@@ -100,12 +105,12 @@ endfunction()
function(select_nvcc_arch_flags out_variable out_arch_bin)
# List of arch names
set(archs_names
"Kepler"
"Maxwell"
"Pascal"
"Volta"
"Turing"
"Ampere"
"Hopper"
"All"
"Manual")
set(archs_name_default "Auto")
@@ -144,9 +149,7 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
unset(CUDA_ARCH_PTX CACHE)
endif()

if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
if(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
if(WITH_NV_JETSON)
set(cuda_arch_bin "53")
else()
@@ -176,6 +179,8 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
set(cuda_arch_bin "80 86")
endif()
endif()
elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
set(cuda_arch_bin "90")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -266,6 +271,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
set(paddle_known_gpu_archs "${paddle_known_gpu_archs12} 86")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
endif()

if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
34 changes: 16 additions & 18 deletions cmake/phi_header.cmake
@@ -17,24 +17,21 @@ set(PADDLE_INFERENCE_INSTALL_DIR

function(phi_header_path_compat TARGET_PATH)
message(STATUS "phi header path compat processing: ${TARGET_PATH}")
string(FIND ${TARGET_PATH} "experimental" pos)
if(pos GREATER 1)
file(GLOB HEADERS "${TARGET_PATH}/*" "*.h")
foreach(header ${HEADERS})
if(${header} MATCHES ".*.h$")
file(READ ${header} HEADER_CONTENT)
string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/"
HEADER_CONTENT "${HEADER_CONTENT}")
string(REPLACE "paddle/fluid/platform/"
"paddle/include/experimental/phi/" HEADER_CONTENT
"${HEADER_CONTENT}")
string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/"
HEADER_CONTENT "${HEADER_CONTENT}")
file(WRITE ${header} "${HEADER_CONTENT}")
message(STATUS "phi header path compat processing complete: ${header}")
endif()
endforeach()
endif()
file(GLOB HEADERS "${TARGET_PATH}/*" "*.h")
foreach(header ${HEADERS})
if(${header} MATCHES ".*.h$")
file(READ ${header} HEADER_CONTENT)
string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/"
HEADER_CONTENT "${HEADER_CONTENT}")
string(REPLACE "paddle/fluid/platform/"
"paddle/include/experimental/phi/" HEADER_CONTENT
"${HEADER_CONTENT}")
string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/"
HEADER_CONTENT "${HEADER_CONTENT}")
file(WRITE ${header} "${HEADER_CONTENT}")
message(STATUS "phi header path compat processing complete: ${header}")
endif()
endforeach()
endfunction()

phi_header_path_compat(
@@ -51,6 +48,7 @@ phi_header_path_compat(
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common)
phi_header_path_compat(
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core)
phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/)

# In order to be compatible with the original behavior, the header file name needs to be changed
file(RENAME
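The phi_header_path_compat function above now rewrites every installed header under the target path (the previous "experimental" substring guard is gone), replacing source-tree include prefixes with the packaged paddle/include/experimental layout. A minimal, self-contained C++ sketch of the same replace-all substitution for one prefix; the function name RewritePhiInclude is illustrative and not part of this commit:

#include <string>

// Mirrors string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" ...):
// replace every occurrence of the source-tree prefix with the installed prefix.
std::string RewritePhiInclude(std::string content) {
  const std::string from = "paddle/phi/";
  const std::string to = "paddle/include/experimental/phi/";
  for (std::string::size_type pos = 0;
       (pos = content.find(from, pos)) != std::string::npos;
       pos += to.size()) {
    content.replace(pos, from.size(), to);
  }
  return content;
}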
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/collective/reducer.cc
@@ -821,9 +821,9 @@ void EagerReducer::MarkVarReady(const size_t var_index,

auto &group = groups_[group_index];
auto &group_tensor = group.dense_tensors_[inside_group_index];
const auto length = group.length_[inside_group_index];

if (!group.is_sparse_) {
const auto length = group.length_[inside_group_index];
if (is_used_var) {
auto *autograd_meta = tensors_[var_index].get_autograd_meta();
auto &grad_tensor =
Expand Up @@ -73,7 +73,6 @@
# bacward api's output usually affected by backward api's input
special_prune_dict = {
"matmul_grad": {"x": "grad_y", "y": "grad_x"},
"multiply_grad": {"x": "grad_y", "y": "grad_x"},
}


@@ -276,6 +275,8 @@ class {} : public egr::GradNodeBase {{
// Before log info
{}
// Forward API Call
{}
// Check NaN and Inf if needed
{}
// Get Outputs
{}
@@ -1675,6 +1676,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
forward_api_name,
before_log_str,
forward_call_str,
check_nan_inf_str,
get_outputs_str,
forward_api_name,
check_inplace_str,
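This generator change threads a new check_nan_inf_str argument into the forward-definition template, so the emitted wrapper gains a "Check NaN and Inf if needed" block between the forward API call and output extraction. A rough hand-written C++ sketch of what such a generated wrapper might look like; the function name relu_ad_func, the FLAGS_check_nan_inf guard, and the exact call shape are assumptions rather than the literal generated code:

// Illustrative shape of a generated eager forward wrapper (not actual codegen output).
paddle::Tensor relu_ad_func(const paddle::Tensor& x) {
  // Forward API Call
  auto api_result = paddle::experimental::relu(x);
  // Check NaN and Inf if needed
  if (FLAGS_check_nan_inf) {
    egr::CheckTensorHasNanOrInf("relu", api_result);
  }
  // Get Outputs
  auto& out = api_result;
  return out;
}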
2 changes: 1 addition & 1 deletion paddle/fluid/eager/backward.cc
@@ -113,7 +113,6 @@ std::vector<paddle::Tensor> RunBackward(

std::queue<GradNodeBase*> force_sequential_nodes_forward_queue =
egr::Controller::Instance().GetForceSequentialNodes();
egr::Controller::Instance().ClearForceSequentialNodes();
std::deque<GradNodeBase*> force_sequential_nodes_queue;
std::set<GradNodeBase*> force_sequential_nodes_set;
std::set<GradNodeBase*> ready_force_sequential_nodes;
@@ -421,6 +420,7 @@ void Backward(const std::vector<paddle::Tensor>& tensors, // outputs
VLOG(3) << "Run in Backward";
paddle::platform::RecordEvent backward_record_event(
"backward", paddle::platform::TracerEventType::UserDefined, 1);
egr::Controller::Instance().ClearForceSequentialNodes();
RunBackward(tensors, grad_tensors, retain_graph);
phi::autotune::AutoTuneStatus::Instance().Update();
}
13 changes: 13 additions & 0 deletions paddle/fluid/eager/nan_inf_utils.cc
@@ -122,6 +122,11 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
}
}

void CheckTensorHasNanOrInf(const std::string& api_name,
const paddle::optional<Tensor>& tensor) {
CheckTensorHasNanOrInf(api_name, tensor.get());
}

void CheckTensorHasNanOrInf(const std::string& api_name,
const TupleOfTwoTensors& tensors) {
CheckTensorHasNanOrInf(api_name, std::get<0>(tensors));
@@ -169,6 +174,14 @@ void CheckTensorHasNanOrInf(const std::string& api_name,
}
}

void CheckTensorHasNanOrInf(
const std::string& api_name,
const paddle::optional<std::vector<Tensor>>& tensors) {
if (tensors) {
CheckTensorHasNanOrInf(api_name, tensors.get());
}
}

void CheckTensorHasNanOrInf(
const std::string& api_name,
const paddle::small_vector<std::vector<paddle::Tensor>,
8 changes: 8 additions & 0 deletions paddle/fluid/eager/nan_inf_utils.h
@@ -20,6 +20,7 @@

#include "paddle/fluid/eager/type_defs.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/utils/optional.h"
#include "paddle/utils/small_vector.h"

namespace egr {
@@ -36,6 +37,9 @@ using TupleOfTensorAndVector =

void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor);

void CheckTensorHasNanOrInf(const std::string& api_name,
const paddle::optional<Tensor>& tensor);

void CheckTensorHasNanOrInf(const std::string& api_name,
const TupleOfTwoTensors& tensors);

@@ -54,6 +58,10 @@ void CheckTensorHasNanOrInf(const std::string& api_name,
void CheckTensorHasNanOrInf(const std::string& api_name,
const std::vector<Tensor>& tensors);

void CheckTensorHasNanOrInf(
const std::string& api_name,
const paddle::optional<std::vector<Tensor>>& tensors);

void CheckTensorHasNanOrInf(const std::string& api_name,
const TupleOfTensorAndVector& tensors);

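Together with the nan_inf_utils.cc hunks above, these declarations extend the NaN/Inf checker to optional inputs: the single-tensor overload forwards through tensor.get(), and the optional-vector overload is skipped when the optional is empty. A short usage sketch assuming the Paddle headers are on the include path; CheckOptionalInputs and its argument names are illustrative:

#include <vector>

#include "paddle/fluid/eager/nan_inf_utils.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/utils/optional.h"

// Illustrative caller: check an optional tensor input and an optional
// tensor-list input with the overloads added in this header.
void CheckOptionalInputs(
    const paddle::optional<paddle::Tensor>& bias,
    const paddle::optional<std::vector<paddle::Tensor>>& extra_inputs) {
  egr::CheckTensorHasNanOrInf("my_api", bias);          // forwards via bias.get()
  egr::CheckTensorHasNanOrInf("my_api", extra_inputs);  // no-op when empty
}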
13 changes: 7 additions & 6 deletions paddle/fluid/framework/data_type.h
@@ -83,12 +83,13 @@ struct DataTypeTrait<void> {
_ForEachDataTypeHelper_( \
callback, ::paddle::platform::complex<double>, COMPLEX128);

#define _ForEachDataTypeNormal_(callback) \
_ForEachDataTypeHelper_(callback, float, FP32); \
_ForEachDataTypeHelper_(callback, double, FP64); \
_ForEachDataTypeHelper_(callback, int, INT32); \
_ForEachDataTypeHelper_(callback, int64_t, INT64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16);
#define _ForEachDataTypeNormal_(callback) \
_ForEachDataTypeHelper_(callback, float, FP32); \
_ForEachDataTypeHelper_(callback, double, FP64); \
_ForEachDataTypeHelper_(callback, int, INT32); \
_ForEachDataTypeHelper_(callback, int64_t, INT64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16);

// For the use of thrust, as index-type elements can be only integers.
#define _ForEachDataTypeTiny_(callback) \
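The only change here is that bfloat16 joins the list of types _ForEachDataTypeNormal_ expands its callback over, so BF16 tensors are handled wherever that macro drives type dispatch. A self-contained sketch of the underlying X-macro pattern with made-up macro and callback names (not Paddle's actual helpers):

#include <cstdint>
#include <iostream>

// Each entry expands the callback once per (C++ type, enum tag) pair,
// analogous to the _ForEachDataTypeHelper_ calls above.
#define FOR_EACH_NORMAL_TYPE(callback) \
  callback(float, FP32);               \
  callback(double, FP64);              \
  callback(int, INT32);                \
  callback(int64_t, INT64)

#define PRINT_TYPE(cpp_type, tag) \
  std::cout << #tag << " -> " << sizeof(cpp_type) << " bytes\n"

int main() {
  FOR_EACH_NORMAL_TYPE(PRINT_TYPE);
  return 0;
}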
8 changes: 8 additions & 0 deletions paddle/fluid/framework/distributed_strategy.proto
@@ -50,11 +50,19 @@ message ShardingConfig {
optional bool enable_tuning = 15 [ default = false ]; // incubate for auto parallel
}

// for dygraph
message MpConfig {
optional bool sync_param= 1 [ default = false ];
optional bool sync_grad= 2 [ default = false ];
optional bool sync_moment= 3 [ default = false ];
}

message HybridConfig {
optional int32 dp_degree = 1 [ default = -1 ];
optional int32 mp_degree = 2 [ default = 1 ];
optional int32 pp_degree = 3 [ default = 1 ];
optional int32 sharding_degree = 4 [ default = 1 ];
optional MpConfig mp_configs = 5;
}

message AMPConfig {
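The new MpConfig message hangs off HybridConfig as mp_configs and adds three sync switches for dygraph tensor parallelism. A hedged C++ sketch of populating those fields through the protoc-generated accessors; the include path and the paddle::distributed namespace are assumptions based on typical protoc output, not verified against this commit:

#include "paddle/fluid/framework/distributed_strategy.pb.h"  // assumed generated header

// Fill the new mp_configs block on a HybridConfig message.
void EnableMpSync(paddle::distributed::HybridConfig* hybrid) {
  hybrid->set_mp_degree(2);
  auto* mp = hybrid->mutable_mp_configs();
  mp->set_sync_param(true);   // sync_param / sync_grad / sync_moment all default to false
  mp->set_sync_grad(false);
  mp->set_sync_moment(false);
}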
2 changes: 1 addition & 1 deletion paddle/fluid/framework/feed_fetch_method.cc
@@ -95,7 +95,7 @@ phi::DenseTensor& GetVariableTensor(const Scope& scope,
PADDLE_ENFORCE_EQ(var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"Only support lod tensor in GetVariableTensor now."));
"Only support DenseTensor in GetVariableTensor now."));
return *var->GetMutable<phi::DenseTensor>();
}

11 changes: 6 additions & 5 deletions paddle/fluid/framework/ir/CMakeLists.txt
@@ -126,6 +126,7 @@ pass_library(matmul_scale_fuse_pass inference)
pass_library(gpu_cpu_map_matmul_to_mul_pass inference)
pass_library(dense_fc_to_sparse_pass inference)
pass_library(dense_multihead_matmul_to_sparse_pass inference)
pass_library(delete_cast_op_pass inference)
pass_library(generate_pass DEPS pass_desc_proto)
target_link_libraries(generate_pass pass_desc_proto)

@@ -242,7 +243,6 @@ if(WITH_XPU)
pass_library(fused_multi_transformer_xpu_quant_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
pass_library(stack_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
pass_library(delete_cast_op_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
endif()

cc_library(
@@ -407,6 +407,11 @@ cc_test(
test_delete_dequant_weight_linear_op_pass
SRCS delete_weight_dequant_linear_op_pass_tester.cc
DEPS delete_weight_dequant_linear_op_pass)
cc_test(
test_delete_cast_op_pass
SRCS delete_cast_op_pass_test.cc
DEPS delete_cast_op_pass)

if(WITH_GPU OR WITH_ROCM)
cc_test(
test_embedding_eltwise_layernorm_fuse_pass
@@ -521,8 +526,4 @@ if(WITH_XPU)
test_stack_fuse_pass
SRCS xpu/stack_fuse_pass_test.cc
DEPS stack_fuse_pass)
cc_test(
test_delete_cast_op_pass
SRCS xpu/delete_cast_op_pass_test.cc
DEPS delete_cast_op_pass)
endif()