diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass_utils.h b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass_utils.h
index 7e874ecb8e95a..923ed2fe88296 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass_utils.h
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass_utils.h
@@ -140,7 +140,7 @@ static int GetSharedSize(const cinn::dialect::ir::OpNode& op_node) {
       lane = inshape[idx];
     }
     // int max_num_threads =
-    // cinn::common::DefaultNVGPUTarget().max_num_threads();
+    // cinn::common::DefaultDeviceTarget().max_num_threads();
     int max_num_threads = 1000;
     if (lane > max_num_threads / 2) {
       return 0;
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc
index 751c2d62a5235..fc2dd62764589 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc
@@ -197,8 +197,8 @@ int GetSharedSize(::pir::Operation* op) {
       lane = inshape[idx];
     }
     // int max_num_threads =
-    // cinn::common::DefaultNVGPUTarget().max_num_threads(); todo(phlrain): get
-    // gpu max threads
+    // cinn::common::DefaultDeviceTarget().max_num_threads();
+    // todo(phlrain): get gpu max threads
     int max_num_threads = 2048;
     if (lane > max_num_threads / 2) {
       return 0;
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/pre_analysis.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/pre_analysis.cc
index e8dbe22b5412d..49407a86557fc 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/pre_analysis.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/pre_analysis.cc
@@ -56,7 +56,7 @@ void FusionOpAnalysis::PreCompileGroup() {
   }
   // Build and trigger compilaion cache.
   VLOG(4) << "Parallel Pre-Compile for Group with size: " << groups.size();
-  PirCompiler pir_compiler(cinn::common::DefaultNVGPUTarget());
+  PirCompiler pir_compiler(cinn::common::DefaultDeviceTarget());
   pir_compiler.Build(groups);
 }
 }  // namespace cinn::dialect::ir::details
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc
index dd6e0ecdf4160..a36c208f0c96c 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc
@@ -61,7 +61,7 @@ std::vector<pir::Value> GetBlockOutsideInput(
 std::unordered_map<std::string, ::pir::Attribute> CompileGroupAsOpAttribute(
     const std::vector<OpLoweringGroupPtr>& group_list) {
-  PirCompiler pir_compiler(cinn::common::DefaultNVGPUTarget());
+  PirCompiler pir_compiler(cinn::common::DefaultDeviceTarget());
   auto fn_ptr_res = pir_compiler.Build(group_list);
@@ ... @@ std::unordered_map<std::string, ::pir::Attribute> GetJitKernelAttr(
       hlir::framework::pir::FusionInfo fusion_info(*group);
       return CompilationCache::Instance().GetKernelInfo(fusion_info);
     } else {
-      PirCompiler pir_compiler(cinn::common::DefaultNVGPUTarget());
+      PirCompiler pir_compiler(cinn::common::DefaultDeviceTarget());
       return pir_compiler.Build({group})[0];
     }
   };
diff --git a/paddle/cinn/hlir/framework/op_lowering_util.cc b/paddle/cinn/hlir/framework/op_lowering_util.cc
index 1948a5189b6f1..68090597a8ecc 100644
--- a/paddle/cinn/hlir/framework/op_lowering_util.cc
+++ b/paddle/cinn/hlir/framework/op_lowering_util.cc
@@ -717,7 +717,7 @@ void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,  // NOLINT
   // If the number of current device SM is smaller than the number of SM
   // required by Warp Reduce, the performance of Warp Reduce is better.
   // Otherwise, use Block Reduce.
-  auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads();
+  auto max_num_threads = cinn::common::DefaultDeviceTarget().max_num_threads();
   int need_reduce_last_count = 1;
   for (int i = 0; i < inshape.size(); i++) {
     if (find(axes.begin(), axes.end(), i) == axes.end()) {
diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_util.cc b/paddle/cinn/hlir/framework/pir/op_lowering_util.cc
index 56c335f6b63ca..029ec700935bb 100644
--- a/paddle/cinn/hlir/framework/pir/op_lowering_util.cc
+++ b/paddle/cinn/hlir/framework/pir/op_lowering_util.cc
@@ -577,7 +577,7 @@ void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,  // NOLINT
   // If the number of current device SM is smaller than the number of SM
   // required by Warp Reduce, the performance of Warp Reduce is better.
   // Otherwise, use Block Reduce.
-  auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads();
+  auto max_num_threads = cinn::common::DefaultDeviceTarget().max_num_threads();
   int need_reduce_last_count = 1;
   for (int i = 0; i < inshape.size(); i++) {
     if (find(axes.begin(), axes.end(), i) == axes.end()) {
diff --git a/paddle/cinn/hlir/op/reduction.cc b/paddle/cinn/hlir/op/reduction.cc
index 0ef92fb6ee309..e162084326245 100644
--- a/paddle/cinn/hlir/op/reduction.cc
+++ b/paddle/cinn/hlir/op/reduction.cc
@@ -263,7 +263,7 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
                               reduce_tmp_out.as_tensor_ref(),
                               tmp_out.as_tensor_ref(),
                               out.as_tensor_ref(),
-                              cinn::common::DefaultNVGPUTarget());
+                              cinn::common::DefaultDeviceTarget());
 
           std::vector<CINNValue> res{
               CINNValue(ir_sch.GetModule().GetExprs().at(0))};
@@ -279,7 +279,7 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
                               reduce_tmp_out.as_tensor_ref(),
                               tmp_out.as_tensor_ref(),
                               out.as_tensor_ref(),
-                              cinn::common::DefaultNVGPUTarget());
+                              cinn::common::DefaultDeviceTarget());
 
           std::vector<CINNValue> res{
               CINNValue(ir_sch.GetModule().GetExprs().at(0))};
diff --git a/paddle/cinn/hlir/pe/reduction.cc b/paddle/cinn/hlir/pe/reduction.cc
index b831d1b588472..12d7e48e44e0b 100644
--- a/paddle/cinn/hlir/pe/reduction.cc
+++ b/paddle/cinn/hlir/pe/reduction.cc
@@ -841,7 +841,7 @@ std::vector<ir::Tensor> TwoStepBlockReduceInternal(
   // If the number of current device SM is smaller than the number of SM
   // required by Warp Reduce, the performance of Warp Reduce is better.
   // Otherwise, use Block Reduce.
-  auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads();
+  auto max_num_threads = cinn::common::DefaultDeviceTarget().max_num_threads();
   int need_reduce_last_count = 1;
   for (int i = 0; i < A->shape.size(); i++) {
     if (find(axes.begin(), axes.end(), i) == axes.end()) {
@@ -851,9 +851,9 @@ std::vector<ir::Tensor> TwoStepBlockReduceInternal(
   int warp_reduce_need_sm_count =
       ceil((need_reduce_last_count * 32) /
            static_cast<float>(
-               cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm()));
+               cinn::common::DefaultDeviceTarget().get_max_threads_per_sm()));
   // Set Num_max_threads to 32 is Warp Reduce
-  if (cinn::common::DefaultNVGPUTarget().get_multi_processor_count() <
+  if (cinn::common::DefaultDeviceTarget().get_multi_processor_count() <
       warp_reduce_need_sm_count) {
     max_num_threads = 32;
   }
diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc
index 114a539e4e3f6..0aaf620874568 100644
--- a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc
+++ b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc
@@ -46,7 +46,7 @@ void TileTactic::Init(ScheduleContext* context) {
   };
   auto GetTreeReduceSize = [&](const ir::Expr& total_rb_extent) -> int64_t {
     const int64_t max_num_threads =
-        common::DefaultNVGPUTarget().max_num_threads();
+        cinn::common::DefaultDeviceTarget().max_num_threads();
     int64_t nums_thread_per_block = max_num_threads;
     if (total_rb_extent.is_constant()) {
       int64_t extent = static_cast<int64_t>(total_rb_extent.get_constant());
diff --git a/paddle/cinn/optim/map_extern_call.cc b/paddle/cinn/optim/map_extern_call.cc
index 1b9bbf1e57374..c7a71629067ca 100644
--- a/paddle/cinn/optim/map_extern_call.cc
+++ b/paddle/cinn/optim/map_extern_call.cc
@@ -91,7 +91,7 @@ void DealWithIntrinsicsImpl(common::NVGPUArch, ir::Call *node, Expr *expr) {
   }
 
   std::string extern_func =
-      hlir::GetExternFuncName(cinn::common::DefaultNVGPUTarget(), dtype, name);
+      hlir::GetExternFuncName(cinn::common::DefaultDeviceTarget(), dtype, name);
   *expr = lang::CallExtern(extern_func, node->read_args, node->attrs);
 }
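
Note for reviewers: several of these hunks sit inside the warp-vs-block-reduce heuristic (see `LoopAssignReduceWithLast` and `TwoStepBlockReduceInternal`), which is hard to read as isolated diff context. Below is a minimal standalone sketch of that decision for review context only. `DeviceTarget`, its hard-coded member values, and `ChooseReduceThreads` are hypothetical stand-ins for `cinn::common::DefaultDeviceTarget()` and the surrounding lowering code, assuming a CUDA-like device with 32-thread warps; the real Target queries the active device instead of hard-coding numbers.

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// Hypothetical stand-in for cinn::common::DefaultDeviceTarget().
struct DeviceTarget {
  int max_num_threads() const { return 1024; }          // threads per block
  int get_max_threads_per_sm() const { return 2048; }   // resident threads per SM
  int get_multi_processor_count() const { return 80; }  // number of SMs
};

DeviceTarget DefaultDeviceTarget() { return DeviceTarget{}; }

// Mirrors the heuristic in TwoStepBlockReduceInternal: Warp Reduce assigns
// one 32-thread warp per independent reduction. If the device has fewer SMs
// than Warp Reduce would need to keep all warps resident, Warp Reduce is the
// better choice; otherwise Block Reduce keeps the full block size.
int ChooseReduceThreads(const std::vector<int>& shape,
                        const std::vector<int>& reduce_axes) {
  DeviceTarget target = DefaultDeviceTarget();
  int max_num_threads = target.max_num_threads();

  // Product of the non-reduced dimensions: the number of independent
  // reductions the kernel has to perform.
  int need_reduce_last_count = 1;
  for (int i = 0; i < static_cast<int>(shape.size()); ++i) {
    if (std::find(reduce_axes.begin(), reduce_axes.end(), i) ==
        reduce_axes.end()) {
      need_reduce_last_count *= shape[i];
    }
  }

  // SMs needed to host one warp per reduction:
  // ceil(reductions * 32 / threads_per_SM).
  int warp_reduce_need_sm_count = static_cast<int>(
      std::ceil((need_reduce_last_count * 32) /
                static_cast<float>(target.get_max_threads_per_sm())));

  // Fewer SMs than Warp Reduce needs -> shrink blocks to a single warp.
  if (target.get_multi_processor_count() < warp_reduce_need_sm_count) {
    max_num_threads = 32;
  }
  return max_num_threads;  // 32 => Warp Reduce, 1024 => Block Reduce
}

int main() {
  // Reducing the last axis of an [8192, 256] tensor: 8192 reductions need
  // ceil(8192 * 32 / 2048) = 128 SMs > 80, so Warp Reduce (32) is chosen.
  std::cout << ChooseReduceThreads({8192, 256}, {1}) << "\n";  // 32
  // An [8, 256] tensor reduced over axis 0 needs only 4 SMs, so Block
  // Reduce keeps the default block size.
  std::cout << ChooseReduceThreads({8, 256}, {0}) << "\n";     // 1024
  return 0;
}
```

The sketch also shows why the rename is mechanical: every call site reads the same thread/SM limits through the target object, so swapping `DefaultNVGPUTarget()` for `DefaultDeviceTarget()` changes where the limits come from without touching the heuristic itself.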