[CINN][New Hardware Update] Replace nv target
* replace DefaultNVGPUTarget with DefaultDeviceTarget
DongBaiYue committed May 17, 2024
1 parent 2188b4a commit 01dd7f9
Showing 10 changed files with 15 additions and 15 deletions.
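The diff below swaps every call to cinn::common::DefaultNVGPUTarget() for cinn::common::DefaultDeviceTarget(), so the lowering passes stop hard-coding the NVIDIA GPU target. The new helper itself is not part of this diff; the following is only a minimal sketch of how such a dispatcher could look, assuming it falls back on the existing DefaultNVGPUTarget()/DefaultHostTarget() helpers and the CINN_WITH_CUDA build flag (the real implementation in this patch series may differ):

// Hypothetical sketch only; not taken from this commit.
// Assumes cinn/common/target.h and the CINN_WITH_CUDA build flag.
#include "paddle/cinn/common/target.h"

namespace cinn::common {

inline const Target& DefaultDeviceTarget() {
#ifdef CINN_WITH_CUDA
  static Target target = DefaultNVGPUTarget();  // CUDA build: keep the old NV behavior
#else
  static Target target = DefaultHostTarget();   // otherwise fall back to the host target
#endif
  return target;
}

}  // namespace cinn::common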
@@ -140,7 +140,7 @@ static int GetSharedSize(const cinn::dialect::ir::OpNode& op_node) {
     lane = inshape[idx];
   }
   // int max_num_threads =
-  //     cinn::common::DefaultNVGPUTarget().max_num_threads();
+  //     cinn::common::DefaultDeviceTarget().max_num_threads();
   int max_num_threads = 1000;
   if (lane > max_num_threads / 2) {
     return 0;
@@ -197,8 +197,8 @@ int GetSharedSize(::pir::Operation* op) {
     lane = inshape[idx];
   }
   // int max_num_threads =
-  //     cinn::common::DefaultNVGPUTarget().max_num_threads(); todo(phlrain): get
-  //     gpu max threads
+  //     cinn::common::DefaultDeviceTarget().max_num_threads();
+  //     todo(phlrain): get gpu max threads
   int max_num_threads = 2048;
   if (lane > max_num_threads / 2) {
     return 0;
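Both GetSharedSize hunks keep the thread budget hard-coded (1000 and 2048) until the commented-out target query can be restored, but the heuristic itself is visible: once the reduced lane exceeds half of the block's thread budget, no shared memory is requested. A small illustrative sketch of that rule, with a made-up helper name (not the real CINN code):

// Illustrative only: the "lane > max_num_threads / 2 => no shared memory" rule
// from the hunks above. SharedBytesForLane is a hypothetical helper.
#include <cstdint>

int64_t SharedBytesForLane(int64_t lane, int64_t max_num_threads,
                           int64_t element_bytes) {
  if (lane > max_num_threads / 2) {
    return 0;  // lane too wide for one block: skip shared-memory staging
  }
  return lane * element_bytes;  // one staged element per lane slot (assumption)
}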
@@ -56,7 +56,7 @@ void FusionOpAnalysis::PreCompileGroup() {
   }
   // Build and trigger compilaion cache.
   VLOG(4) << "Parallel Pre-Compile for Group with size: " << groups.size();
-  PirCompiler pir_compiler(cinn::common::DefaultNVGPUTarget());
+  PirCompiler pir_compiler(cinn::common::DefaultDeviceTarget());
   pir_compiler.Build(groups);
 }
 }  // namespace cinn::dialect::ir::details
@@ -61,7 +61,7 @@ std::vector<pir::Value> GetBlockOutsideInput(
 std::unordered_map<OpLoweringGroupPtr,
                    std::unordered_map<std::string, pir::Attribute>>
 CompileGroupAsOpAttribute(const std::vector<OpLoweringGroupPtr>& group_list) {
-  PirCompiler pir_compiler(cinn::common::DefaultNVGPUTarget());
+  PirCompiler pir_compiler(cinn::common::DefaultDeviceTarget());
   auto fn_ptr_res = pir_compiler.Build(group_list);

   std::unordered_map<OpLoweringGroupPtr,
@@ -85,7 +85,7 @@ std::unordered_map<std::string, ::pir::Attribute> GetJitKernelAttr(
     hlir::framework::pir::FusionInfo fusion_info(*group);
     return CompilationCache::Instance().GetKernelInfo(fusion_info);
   } else {
-    PirCompiler pir_compiler(cinn::common::DefaultNVGPUTarget());
+    PirCompiler pir_compiler(cinn::common::DefaultDeviceTarget());
     return pir_compiler.Build({group})[0];
   }
 };
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/framework/op_lowering_util.cc
@@ -717,7 +717,7 @@ void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,  // NOLINT
   // If the number of current device SM is smaller than the number of SM
   // required by Warp Reduce, the performance of Warp Reduce is better.
   // Otherwise, use Block Reduce.
-  auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads();
+  auto max_num_threads = cinn::common::DefaultDeviceTarget().max_num_threads();
   int need_reduce_last_count = 1;
   for (int i = 0; i < inshape.size(); i++) {
     if (find(axes.begin(), axes.end(), i) == axes.end()) {
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/framework/pir/op_lowering_util.cc
@@ -577,7 +577,7 @@ void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,  // NOLINT
   // If the number of current device SM is smaller than the number of SM
   // required by Warp Reduce, the performance of Warp Reduce is better.
   // Otherwise, use Block Reduce.
-  auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads();
+  auto max_num_threads = cinn::common::DefaultDeviceTarget().max_num_threads();
   int need_reduce_last_count = 1;
   for (int i = 0; i < inshape.size(); i++) {
     if (find(axes.begin(), axes.end(), i) == axes.end()) {
4 changes: 2 additions & 2 deletions paddle/cinn/hlir/op/reduction.cc
@@ -263,7 +263,7 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
                       reduce_tmp_out.as_tensor_ref(),
                       tmp_out.as_tensor_ref(),
                       out.as_tensor_ref(),
-                      cinn::common::DefaultNVGPUTarget());
+                      cinn::common::DefaultDeviceTarget());

       std::vector<CINNValue> res{
           CINNValue(ir_sch.GetModule().GetExprs().at(0))};
@@ -279,7 +279,7 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
                       reduce_tmp_out.as_tensor_ref(),
                       tmp_out.as_tensor_ref(),
                       out.as_tensor_ref(),
-                      cinn::common::DefaultNVGPUTarget());
+                      cinn::common::DefaultDeviceTarget());

       std::vector<CINNValue> res{
           CINNValue(ir_sch.GetModule().GetExprs().at(0))};
6 changes: 3 additions & 3 deletions paddle/cinn/hlir/pe/reduction.cc
@@ -841,7 +841,7 @@ std::vector<ir::Tensor> TwoStepBlockReduceInternal(
   // If the number of current device SM is smaller than the number of SM
   // required by Warp Reduce, the performance of Warp Reduce is better.
   // Otherwise, use Block Reduce.
-  auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads();
+  auto max_num_threads = cinn::common::DefaultDeviceTarget().max_num_threads();
   int need_reduce_last_count = 1;
   for (int i = 0; i < A->shape.size(); i++) {
     if (find(axes.begin(), axes.end(), i) == axes.end()) {
@@ -851,9 +851,9 @@
   int warp_reduce_need_sm_count =
       ceil((need_reduce_last_count * 32) /
            static_cast<float>(
-               cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm()));
+               cinn::common::DefaultDeviceTarget().get_max_threads_per_sm()));
   // Set Num_max_threads to 32 is Warp Reduce
-  if (cinn::common::DefaultNVGPUTarget().get_multi_processor_count() <
+  if (cinn::common::DefaultDeviceTarget().get_multi_processor_count() <
       warp_reduce_need_sm_count) {
     max_num_threads = 32;
   }
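The pe/reduction.cc hunks contain the full warp-vs-block reduce decision: the pass estimates how many SMs a warp reduce would occupy (32 threads per reduced row) and keeps the warp-reduce path only when the device has fewer SMs than that estimate. A worked example of the same arithmetic with made-up device numbers (the real values come from the target query):

// Worked example of the decision above; device numbers are illustrative.
#include <cmath>
#include <cstdio>

int main() {
  int need_reduce_last_count = 512;  // product of the non-reduced dims (example)
  int max_threads_per_sm = 2048;     // e.g. get_max_threads_per_sm()
  int multi_processor_count = 4;     // e.g. get_multi_processor_count()
  int max_num_threads = 1024;        // e.g. max_num_threads()

  // Each reduced row keeps one warp (32 threads); how many SMs would that need?
  int warp_reduce_need_sm_count = static_cast<int>(
      std::ceil((need_reduce_last_count * 32) /
                static_cast<float>(max_threads_per_sm)));  // ceil(16384 / 2048) = 8

  // Fewer SMs than the warp-reduce footprint: cap the block at one warp.
  if (multi_processor_count < warp_reduce_need_sm_count) {
    max_num_threads = 32;
  }
  std::printf("warp_reduce_need_sm_count = %d, max_num_threads = %d\n",
              warp_reduce_need_sm_count, max_num_threads);
  return 0;
}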
2 changes: 1 addition & 1 deletion paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc
@@ -46,7 +46,7 @@ void TileTactic::Init(ScheduleContext* context) {
   };
   auto GetTreeReduceSize = [&](const ir::Expr& total_rb_extent) -> int64_t {
     const int64_t max_num_threads =
-        common::DefaultNVGPUTarget().max_num_threads();
+        cinn::common::DefaultDeviceTarget().max_num_threads();
     int64_t nums_thread_per_block = max_num_threads;
     if (total_rb_extent.is_constant()) {
       int64_t extent = static_cast<int64_t>(total_rb_extent.get_constant());
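The TileTactic hunk shows only the start of GetTreeReduceSize: the per-block thread count begins at the device thread budget and is then refined when the reduce extent is a compile-time constant. The rest of the lambda is not shown here; the following is a plausible sketch under a common "smallest power of two covering the extent, capped at the device budget" policy, which is an assumption and not necessarily what CINN does:

// Hypothetical sketch of a tree-reduce block-size choice; not the CINN code.
#include <algorithm>
#include <cstdint>

int64_t TreeReduceSize(int64_t extent, int64_t max_num_threads) {
  int64_t nums_thread_per_block = 1;
  // Grow to the smallest power of two that covers the reduce extent.
  while (nums_thread_per_block < extent &&
         nums_thread_per_block < max_num_threads) {
    nums_thread_per_block *= 2;
  }
  return std::min(nums_thread_per_block, max_num_threads);
}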
2 changes: 1 addition & 1 deletion paddle/cinn/optim/map_extern_call.cc
@@ -91,7 +91,7 @@ void DealWithIntrinsicsImpl(common::NVGPUArch, ir::Call *node, Expr *expr) {
   }

   std::string extern_func =
-      hlir::GetExternFuncName(cinn::common::DefaultNVGPUTarget(), dtype, name);
+      hlir::GetExternFuncName(cinn::common::DefaultDeviceTarget(), dtype, name);
   *expr = lang::CallExtern(extern_func, node->read_args, node->attrs);
 }
