Skip to content

Commit

Permalink
Merge branch 'develop' into 104
Browse files Browse the repository at this point in the history
  • Loading branch information
zeroRains authored Nov 4, 2023
2 parents d19de99 + 9698830 commit 649b518
Show file tree
Hide file tree
Showing 554 changed files with 18,196 additions and 5,331 deletions.
6 changes: 0 additions & 6 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,3 @@ per-file-ignores =

# Ignore compare with True in sot unittest
test/sot/test_dup_top.py:E712

# temp ignore base directory
python/paddle/base/*:
E712,
E266,
E714
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,13 @@ repos:
hooks:
- id: cmakelint
args: [--config=./tools/codestyle/.cmakelintrc]

- repo: local
hooks:
- id: sort-txt-file
name: sort-txt-file
description: Sorts each line string in a text file
entry: python ./tools/codestyle/sort_txt_file.py
language: python
files: test/white_list/new_ir_op_test_white_list
args: []
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ set(XPU_XFT_LIB_NAME "libxft.so")
set(XPU_XPTI_LIB_NAME "libxpti.so")

if(NOT DEFINED XPU_BASE_DATE)
set(XPU_BASE_DATE "20231023")
set(XPU_BASE_DATE "20231025")
endif()
set(XPU_XCCL_BASE_VERSION "1.0.53.6")
if(NOT DEFINED XPU_XFT_BASE_VERSION)
Expand Down
27 changes: 27 additions & 0 deletions paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule_block_graph.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"

Expand Down Expand Up @@ -94,13 +95,26 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
auto all_loops = ir_schedule->GetLoops(block_name);
CHECK_LE(num_loops_to_bind, all_loops.size())
<< "The number of loops to be bind is greater than size of all_loops";
CHECK_GE(num_loops_to_bind, 0)
<< "The number of loops to be bind should be greater than 0";
// check whether it is the case that threadIdx has been binded but blockIdx
// not, the threadIdx can only be binded in the first loop after
// num_loops_to_bind loops because we has excluded other cases in
// CountLoopCanBinded
bool gpu_thread_has_binded =
num_loops_to_bind < all_loops.size() &&
all_loops[num_loops_to_bind].As<ir::For>()->is_gpu_thread_binded();
ir::BlockOrderConstructor block_order_constructor;
std::map<std::vector<int>, ir::Expr> blocks_order_with_ctrl_stmt =
block_order_constructor(&all_loops[num_loops_to_bind - 1]);
for (auto& pair : blocks_order_with_ctrl_stmt) {
if (pair.first.size() == 2) {
ir::Expr stmt = pair.second;
if (stmt.As<ir::For>() && stmt.As<ir::For>()->is_gpu_thread_binded()) {
gpu_thread_has_binded = true;
}
}
}
Expr fused_loop = ir_schedule->Fuse(
{all_loops.begin(), all_loops.begin() + num_loops_to_bind});
int32_t extent = fused_loop.As<ir::For>()->extent.as_int32();
Expand Down Expand Up @@ -181,5 +195,18 @@ std::vector<SearchState> AutoBind::ApplyOnBlock(SearchState state,
return {new_state};
}

// Applies the auto-bind rule to the schedule block named `block_name`:
// if any of its outermost loops are eligible for GPU index binding, bind
// them to blockIdx/threadIdx within the hardware limits.
void AutoBind::Apply(ir::IRSchedule* ir_schedule,
                     const std::string& block_name) {
  // Number of outer loops that can legally receive a GPU binding.
  const int bindable_loop_count =
      CountLoopCanBinded(ir_schedule->GetLoops(block_name)[0].As<ir::For>());
  if (bindable_loop_count <= 0) {
    return;  // nothing to bind for this block
  }
  // Fuse and bind the eligible loops, capped by kMaxBlocks and the
  // target's per-block thread limit.
  BindGPUIndex(ir_schedule,
               block_name,
               bindable_loop_count,
               kMaxBlocks,
               target_->max_num_threads());
}

} // namespace auto_schedule
} // namespace cinn
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ class AutoBind : public AutoGenRule {
std::vector<SearchState> ApplyOnBlock(SearchState state,
const std::string& block_name) override;

void Apply(ir::IRSchedule* ir_schedule, const std::string& block_name);

private:
std::vector<Expr> applicable_schedule_blocks_;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule/ir_schedule_util.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"

Expand All @@ -49,6 +50,11 @@ bool AutoInline::CanInlineIntoConsumer(const Expr& sche_block_realize_expr,
ir::Expr root = ir_sch->GetRootBlock(sche_block_realize_expr);

// Check the schedule block to be inlined is not a reduce tensor.
for (const ir::Var& iter_var : sche_block->iter_vars) {
if (iter_var->is_reduce_axis) {
return false;
}
}
std::set<ir::Expr> find_store = ir::ir_utils::CollectIRNodesWithoutTensor(
compute_body, [&](const Expr* x) { return x->As<ir::Store>(); });
if (find_store.size() != 1UL) {
Expand All @@ -69,6 +75,29 @@ bool AutoInline::CanInlineIntoConsumer(const Expr& sche_block_realize_expr,
return false;
}

// the xxx_reduce_init block cannot be inlined.
if (ir::IsReduceInitTensorName(tensor->name)) {
return false;
}

// Skip external calls
std::vector<ir::Expr> consumers =
ir::GetConsumers(sche_block_realize_expr, root);
for (const ir::Expr& consumer : consumers) {
std::set<ir::Expr> find_load = ir::ir_utils::CollectIRNodesWithoutTensor(
consumer.As<ir::ScheduleBlockRealize>()
->schedule_block.As<ir::ScheduleBlock>()
->body,
[&](const ir::Expr* x) {
return x->As<ir::Load>() &&
x->As<ir::Load>()->tensor.as_tensor_ref()->name ==
tensor->name;
});
if (find_load.empty()) {
return false;
}
}

// write_buffers.size() = 1 and read_buffers is empty, means const
// we can inline to consumer
if (sche_block->read_buffers.empty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ class AutoInline : public AutoGenRule {
std::vector<SearchState> ApplyOnBlock(SearchState state,
const std::string& block_name) override;

private:
void Apply(ir::IRSchedule* ir_schedule, ir::Expr& block_expr); // NOLINT

private:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,22 +161,42 @@ void ReductionFactoring::Apply(const std::string& block_name,
// 5. Split the reduction loop into 2 part
VLOG(6) << "before Split: " << ir_schedule->GetModule().GetExprs()[0];
int factor = 1;
int max_factor = 1024;
int extent = ir::GetLoopExtent(fused_reduce_loop);
for (int i = ceil(sqrt(extent)); i >= 1; --i) {
for (int i = max_factor; i >= 1; --i) {
if (extent % i == 0) {
factor = i;
break;
}
}
std::vector<cinn::ir::Expr> splited_reduction_loops =
ir_schedule->Split(fused_reduce_loop, {-1, factor});
ir_schedule->Split(fused_reduce_loop, {factor, -1});
// 6. Apply FactorizeReduction
VLOG(6) << "before FactorizeReduction: "
<< ir_schedule->GetModule().GetExprs()[0];
ir_schedule->FactorizeReduction(splited_reduction_loops[0],
num_spatial_loops);
VLOG(6) << "after FactorizeReduction: "
<< ir_schedule->GetModule().GetExprs()[0];

// 7. Loop fusion and cross thread reduction
std::vector<ir::Expr> rb_loops = ir_schedule->GetLoops(block_name);
ir::Expr rf_block = ir_schedule->GetBlock(block_name + "_rf");
ir_schedule->SimpleComputeAt(rf_block, rb_loops.back());

rb_loops = ir_schedule->GetLoops(block_name);
ir::Expr rf_init_block =
ir_schedule->GetBlock(block_name + "_rf__reduce_init");
ir_schedule->SimpleComputeAt(rf_init_block, rb_loops.back());

if (*target_ == common::DefaultNVGPUTarget()) {
rb_loops = ir_schedule->GetLoops(block_name);
rf_block = ir_schedule->GetBlock(block_name + "_rf");
ir_schedule->Bind(rb_loops.back(), "threadIdx.x");
ir_schedule->SetBuffer(rf_block, "shared");
}
VLOG(6) << "Loop fusion and cross thread reduction: "
<< ir_schedule->GetModule().GetExprs()[0];
}

} // namespace auto_schedule
Expand Down
Loading

0 comments on commit 649b518

Please sign in to comment.