[CINN][New Hardware Update] rename IRCudaSchedule #64318

Merged · 1 commit · May 15, 2024
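Summary: as part of the new-hardware update, this commit renames the CUDA-specific IR schedule helpers in paddle/cinn/hlir/pe to GPU-generic names and updates every call site in the HLIR op strategies:

IRCudaScheduleInjective           -> IRGpuScheduleInjective
IRCudaScheduleMatMul              -> IRGpuScheduleMatMul
IRCudaScheduleReduce              -> IRGpuScheduleReduce
IRCudaScheduleBlockReduce         -> IRGpuScheduleBlockReduce
IRCudaScheduleBlockReduceInternal -> IRGpuScheduleBlockReduceInternal
IRCudaScheduleBlockShuffleReduce  -> IRGpuScheduleBlockShuffleReduce
IRCudaTwoStepReduceSchedule       -> IRGpuTwoStepReduceSchedule

Only the function names and the matching VLOG strings change; the schedule logic itself is untouched. Helpers such as IRCudaSplitSchedule and IRScheduleInjectiveCPU are not renamed in this commit.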
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/op/contrib/gather_nd.cc
@@ -195,7 +195,7 @@ std::shared_ptr<framework::OpStrategy> StrategyForGatherNd(
},
[&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
[&](common::NVGPUArch) {
-pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target);
+pe::IRGpuScheduleInjective(ir_sch, output_shapes.front(), target);
},
});
}
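For context, every call site touched by this PR has the same shape: a per-architecture visitor whose NVGPUArch arm invokes the (now renamed) GPU schedule. Below is a minimal, self-contained sketch of that pattern; the std::variant/std::visit machinery and the type names are illustrative assumptions, not CINN's actual Arch API — only the lambda-per-arch structure mirrors the code above.

#include <iostream>
#include <variant>

// Illustrative stand-ins for CINN's architecture tags (assumed names).
struct X86Arch {};
struct ARMArch {};
struct NVGPUArch {};
using Arch = std::variant<X86Arch, ARMArch, NVGPUArch>;

// Classic overload-set helper for std::visit (C++17).
template <class... Ts>
struct Overloaded : Ts... {
  using Ts::operator()...;
};
template <class... Ts>
Overloaded(Ts...) -> Overloaded<Ts...>;

void ApplySchedule(const Arch &arch) {
  std::visit(Overloaded{
                 [&](X86Arch) { std::cout << "CPU injective schedule\n"; },
                 [&](ARMArch) { std::cout << "ARM: not implemented\n"; },
                 [&](NVGPUArch) {
                   // The real code calls, e.g.:
                   // pe::IRGpuScheduleInjective(ir_sch, output_shapes.front(), target);
                   std::cout << "GPU injective schedule\n";
                 },
             },
             arch);
}

int main() {
  ApplySchedule(NVGPUArch{});  // prints: GPU injective schedule
  return 0;
}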
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/op/contrib/repeat.cc
@@ -206,7 +206,7 @@ std::shared_ptr<framework::OpStrategy> StrategyForRepeat(
},
[&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
[&](common::NVGPUArch) {
-pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target);
+pe::IRGpuScheduleInjective(ir_sch, output_shapes.front(), target);
},
});
}
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/op/contrib/resize.cc
@@ -252,7 +252,7 @@ std::shared_ptr<framework::OpStrategy> StrategyForResize(
},
[&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
[&](common::NVGPUArch) {
-pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target);
+pe::IRGpuScheduleInjective(ir_sch, output_shapes.front(), target);
},
});
}
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/op/nn.cc
@@ -394,7 +394,7 @@ std::shared_ptr<OpStrategy> StrategyForConv2d(
// gen, this code is to be removed.
if (conv_type != "forward") {
CHECK_EQ(vec_ast.size(), 1);
-pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target);
+pe::IRGpuScheduleInjective(ir_sch, output_shapes.front(), target);
std::vector<CINNValue> res{
CINNValue(ir_sch.GetModule().GetExprs().at(0))};
*ret = CINNValuePack{res};
58 changes: 29 additions & 29 deletions paddle/cinn/hlir/op/reduction.cc
@@ -227,8 +227,8 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
Expr out = vec_tensor[0];
Expr tmp_out = vec_tensor[1];

VLOG(3) << "Do IRCudaScheduleBlockReduceInternal Schedule!";
pe::IRCudaScheduleBlockReduceInternal(
VLOG(3) << "Do IRGpuScheduleBlockReduceInternal Schedule!";
pe::IRGpuScheduleBlockReduceInternal(
ir_sch, tmp_out.as_tensor_ref(), out.as_tensor_ref(), target);

std::vector<CINNValue> res{
@@ -240,12 +240,12 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
Expr tmp_out = vec_tensor[1];
Expr reduce_tmp_out = vec_tensor[2];

VLOG(3) << "Do IRCudaScheduleBlockReduce Schedule!";
pe::IRCudaScheduleBlockReduce(ir_sch,
reduce_tmp_out.as_tensor_ref(),
tmp_out.as_tensor_ref(),
out.as_tensor_ref(),
target);
VLOG(3) << "Do IRGpuScheduleBlockReduce Schedule!";
pe::IRGpuScheduleBlockReduce(ir_sch,
reduce_tmp_out.as_tensor_ref(),
tmp_out.as_tensor_ref(),
out.as_tensor_ref(),
target);

std::vector<CINNValue> res{
CINNValue(ir_sch.GetModule().GetExprs().at(0))};
@@ -257,13 +257,13 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
Expr reduce_tmp_out = vec_tensor[2];
Expr reshape = vec_tensor[3];

VLOG(3) << "Do IRCudaTwoStepReduceSchedule Schedule!";
pe::IRCudaTwoStepReduceSchedule(ir_sch,
reshape.as_tensor_ref(),
reduce_tmp_out.as_tensor_ref(),
tmp_out.as_tensor_ref(),
out.as_tensor_ref(),
cinn::common::DefaultNVGPUTarget());
VLOG(3) << "Do IRGpuTwoStepReduceSchedule Schedule!";
pe::IRGpuTwoStepReduceSchedule(ir_sch,
reshape.as_tensor_ref(),
reduce_tmp_out.as_tensor_ref(),
tmp_out.as_tensor_ref(),
out.as_tensor_ref(),
cinn::common::DefaultNVGPUTarget());

std::vector<CINNValue> res{
CINNValue(ir_sch.GetModule().GetExprs().at(0))};
@@ -274,12 +274,12 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
Expr tmp_out = vec_tensor[1];
Expr reduce_tmp_out = vec_tensor[2];

VLOG(3) << "Do IRCudaScheduleBlockReduce Schedule!";
pe::IRCudaScheduleBlockReduce(ir_sch,
reduce_tmp_out.as_tensor_ref(),
tmp_out.as_tensor_ref(),
out.as_tensor_ref(),
cinn::common::DefaultNVGPUTarget());
VLOG(3) << "Do IRGpuScheduleBlockReduce Schedule!";
pe::IRGpuScheduleBlockReduce(ir_sch,
reduce_tmp_out.as_tensor_ref(),
tmp_out.as_tensor_ref(),
out.as_tensor_ref(),
cinn::common::DefaultNVGPUTarget());

std::vector<CINNValue> res{
CINNValue(ir_sch.GetModule().GetExprs().at(0))};
@@ -292,8 +292,8 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
CHECK_EQ(vec_tensor.size(), 1);
Expr reduce_out = vec_tensor[0];

VLOG(3) << "Do IRCudaScheduleReduce Schedule!";
pe::IRCudaScheduleReduce(
VLOG(3) << "Do IRGpuScheduleReduce Schedule!";
pe::IRGpuScheduleReduce(
ir_sch,
reduce_out.as_tensor_ref(),
inputs[0]->shape.size() - reduce_axes.back() - 1,
@@ -308,12 +308,12 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
Expr reduce_internal = vec_tensor[1];
Expr reduce_reshape = vec_tensor[2];

VLOG(3) << "Do IRCudaScheduleBlockShuffleReduce Schedule!";
pe::IRCudaScheduleBlockShuffleReduce(ir_sch,
reduce_reshape.as_tensor_ref(),
reduce_internal.as_tensor_ref(),
reduce_out.as_tensor_ref(),
target);
VLOG(3) << "Do IRGpuScheduleBlockShuffleReduce Schedule!";
pe::IRGpuScheduleBlockShuffleReduce(ir_sch,
reduce_reshape.as_tensor_ref(),
reduce_internal.as_tensor_ref(),
reduce_out.as_tensor_ref(),
target);

std::vector<CINNValue> res{
CINNValue(ir_sch.GetModule().GetExprs().at(0))};
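Note on the hunks above: each arm of StrategyForReduce unpacks a different number of result tensors before choosing a schedule variant, and the conditions selecting each arm are elided from this diff. A hypothetical selector, inferred only from the tensor counts each branch consumes — not the real CINN control flow:

// Hedged sketch: which IRGpu* schedule each branch shape corresponds to.
switch (vec_tensor.size()) {
  case 1:  // reduce_out only
    // pe::IRGpuScheduleReduce(ir_sch, ..., target);
    break;
  case 2:  // out, tmp_out
    // pe::IRGpuScheduleBlockReduceInternal(ir_sch, ..., target);
    break;
  case 3:  // out, tmp_out, reduce_tmp_out (or reduce_out, reduce_internal, reduce_reshape)
    // pe::IRGpuScheduleBlockReduce(...) or pe::IRGpuScheduleBlockShuffleReduce(...);
    break;
  case 4:  // reshape as well
    // pe::IRGpuTwoStepReduceSchedule(ir_sch, ..., target);
    break;
}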
4 changes: 2 additions & 2 deletions paddle/cinn/hlir/op/transform.cc
@@ -134,7 +134,7 @@ std::shared_ptr<OpStrategy> StrategyForMatMul(
<< "The input argument of matmul schedule is empty! Please check.\n";
CINNValuePack arg_pack = args[0];
std::vector<CINNValue> results =
-pe::IRCudaScheduleMatMul(arg_pack, output_shape, target);
+pe::IRGpuScheduleMatMul(arg_pack, output_shape, target);
*ret = CINNValuePack({results});
});

@@ -660,7 +660,7 @@ std::shared_ptr<OpStrategy> StrategyForMul(
<< "The input argument of matmul schedule is empty! Please check.\n";
CINNValuePack arg_pack = args[0];
std::vector<CINNValue> results =
-pe::IRCudaScheduleMatMul(arg_pack, output_shape, target);
+pe::IRGpuScheduleMatMul(arg_pack, output_shape, target);
*ret = CINNValuePack({results});
});

80 changes: 40 additions & 40 deletions paddle/cinn/hlir/pe/ir_schedule_pe.cc
@@ -157,10 +157,10 @@ void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,  // NOLINT
<< ir_sch.GetModule().GetExprs().at(0);
}

-void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
-                             const std::vector<int> &output_shape,
-                             const cinn::common::Target &target) {
-  VLOG(3) << "Begin IRCudaScheduleInjective ";
+void IRGpuScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
+                            const std::vector<int> &output_shape,
+                            const cinn::common::Target &target) {
+  VLOG(3) << "Begin IRGpuScheduleInjective ";
auto all_blocks = ir_sch.GetAllBlocks();
auto loops = ir_sch.GetLoops(all_blocks[0]);
auto fused = ir_sch.Fuse(loops);
@@ -176,11 +176,11 @@ void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
} else {
ir_sch.Bind(fused, "threadIdx.x");
}
VLOG(3) << "After IRCudaScheduleInjective, new ir is : "
VLOG(3) << "After IRGpuScheduleInjective, new ir is : "
<< ir_sch.GetModule().GetExprs().at(0);
}
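The hunks above show only the tail of IRGpuScheduleInjective's thread-binding logic. A hedged reconstruction of the decision being made, purely for orientation — the flattened extent, the thread budget, and the Split factors are assumptions inferred from the visible else-branch, not the exact CINN code:

// Assumed sketch: flatten the loop nest, then bind it to the GPU hierarchy.
// `num_threads` stands in for whatever per-block budget `target` provides.
int64_t loop_extent = 1;
for (int dim : output_shape) loop_extent *= dim;

const int num_threads = 1024;  // assumption, not CINN's actual bound
if (loop_extent > num_threads) {
  // Split the fused loop into (blocks, threads) and bind both levels.
  auto splited = ir_sch.Split(fused, {-1, num_threads});
  ir_sch.Bind(splited[0], "blockIdx.x");
  ir_sch.Bind(splited[1], "threadIdx.x");
} else {
  ir_sch.Bind(fused, "threadIdx.x");  // small outputs: one block suffices
}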

-std::vector<cinn::common::CINNValue> IRCudaScheduleMatMul(
+std::vector<cinn::common::CINNValue> IRGpuScheduleMatMul(
const cinn::common::CINNValuePack &arg_pack,
const std::vector<int> &output_shape,
const cinn::common::Target &target) {
@@ -359,11 +359,11 @@ void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,  // NOLINT
<< ir_sch.GetModule().GetExprs().at(0);
}

-void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,  // NOLINT
-                          ir::Tensor output,
-                          int last_dimension_num,
-                          const cinn::common::Target &target) {
-  VLOG(3) << "Before IRCudaScheduleReduce : "
+void IRGpuScheduleReduce(ir::IRSchedule &ir_sch,  // NOLINT
+                         ir::Tensor output,
+                         int last_dimension_num,
+                         const cinn::common::Target &target) {
+  VLOG(3) << "Before IRGpuScheduleReduce : "
<< ir_sch.GetModule().GetExprs().at(0);
int parallel_thread_num = 1;
auto &output_shape = output->shape;
@@ -411,15 +411,15 @@ void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,  // NOLINT
auto loops = ir_sch.GetLoops(output->name);
ir_sch.Bind(loops[0], "blockIdx.x");
}
VLOG(3) << "After IRCudaScheduleReduce : "
VLOG(3) << "After IRGpuScheduleReduce : "
<< ir_sch.GetModule().GetExprs().at(0);
}

-void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,  // NOLINT
-                                       ir::Tensor tmp_out,
-                                       ir::Tensor out,
-                                       const cinn::common::Target &target) {
-  VLOG(3) << "Before IRCudaScheduleBlockReduceInternal : "
+void IRGpuScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,  // NOLINT
+                                      ir::Tensor tmp_out,
+                                      ir::Tensor out,
+                                      const cinn::common::Target &target) {
+  VLOG(3) << "Before IRGpuScheduleBlockReduceInternal : "
<< ir_sch.GetModule().GetExprs().at(0);
int fuse_times = ir_sch.GetLoops(tmp_out->name).size() - 2;
for (int idx = 0; idx < fuse_times; ++idx) {
@@ -509,16 +509,16 @@ void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,  // NOLINT
}
}

VLOG(3) << "After IRCudaScheduleBlockReduceInternal : "
VLOG(3) << "After IRGpuScheduleBlockReduceInternal : "
<< ir_sch.GetModule().GetExprs().at(0);
}

-void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,  // NOLINT
-                               ir::Tensor reduce_tmp_out,
-                               ir::Tensor tmp_out,
-                               ir::Tensor out,
-                               const cinn::common::Target &target) {
-  VLOG(3) << "Before IRCudaScheduleBlockReduce : "
+void IRGpuScheduleBlockReduce(ir::IRSchedule &ir_sch,  // NOLINT
+                              ir::Tensor reduce_tmp_out,
+                              ir::Tensor tmp_out,
+                              ir::Tensor out,
+                              const cinn::common::Target &target) {
+  VLOG(3) << "Before IRGpuScheduleBlockReduce : "
<< ir_sch.GetModule().GetExprs().at(0);
int tmp_put_shape_size_without_reduce = 0;
for (auto i : tmp_out->shape) {
@@ -659,16 +659,16 @@ void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,  // NOLINT
}
}

VLOG(3) << "After IRCudaScheduleBlockReduce : "
VLOG(3) << "After IRGpuScheduleBlockReduce : "
<< ir_sch.GetModule().GetExprs().at(0);
}

-void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
-                                      ir::Tensor reshape,
-                                      ir::Tensor internal,
-                                      ir::Tensor reduce_out,
-                                      const cinn::common::Target &target) {
-  VLOG(3) << "Before IRCudaScheduleBlockShuffleReduce : "
+void IRGpuScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
+                                     ir::Tensor reshape,
+                                     ir::Tensor internal,
+                                     ir::Tensor reduce_out,
+                                     const cinn::common::Target &target) {
+  VLOG(3) << "Before IRGpuScheduleBlockShuffleReduce : "
<< ir_sch.GetModule().GetExprs().at(0);
// reshape compute inline
{
@@ -921,17 +921,17 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
ir_sch.Unroll(r_loops.back());
}
}
VLOG(3) << "After IRCudaScheduleBlockShuffleReduce : "
VLOG(3) << "After IRGpuScheduleBlockShuffleReduce : "
<< ir_sch.GetModule().GetExprs().at(0);
}

-void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,  // NOLINT
-                                 ir::Tensor reshape,
-                                 ir::Tensor internal,
-                                 ir::Tensor tmp_out,
-                                 ir::Tensor out,
-                                 const cinn::common::Target &target) {
-  VLOG(3) << "Before IRCudaTwoStepReduceSchedule : "
+void IRGpuTwoStepReduceSchedule(ir::IRSchedule &ir_sch,  // NOLINT
+                                ir::Tensor reshape,
+                                ir::Tensor internal,
+                                ir::Tensor tmp_out,
+                                ir::Tensor out,
+                                const cinn::common::Target &target) {
+  VLOG(3) << "Before IRGpuTwoStepReduceSchedule : "
<< ir_sch.GetModule().GetExprs().at(0);
// fuse axis
int fuse_times =
@@ -1038,7 +1038,7 @@ void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,  // NOLINT
}
}
}
VLOG(3) << "After IRCudaTwoStepReduceSchedule : "
VLOG(3) << "After IRGpuTwoStepReduceSchedule : "
<< ir_sch.GetModule().GetExprs().at(0);
// ir_sch.SimpleComputeAt(ir_sch.GetBlock(tmp_out->name),
// ir_sch.GetLoops(out->name)[0]);
58 changes: 29 additions & 29 deletions paddle/cinn/hlir/pe/ir_schedule_pe.h
@@ -44,11 +44,11 @@ void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,  // NOLINT
const cinn::common::Target &target,
bool vectorizable = true);

-void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
-                             const std::vector<int> &output_shape,
-                             const cinn::common::Target &target);
+void IRGpuScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
+                            const std::vector<int> &output_shape,
+                            const cinn::common::Target &target);

-std::vector<cinn::common::CINNValue> IRCudaScheduleMatMul(
+std::vector<cinn::common::CINNValue> IRGpuScheduleMatMul(
const cinn::common::CINNValuePack &arg_pack,
const std::vector<int> &output_shape,
const cinn::common::Target &target);
@@ -66,34 +66,34 @@ void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,  // NOLINT
int axis,
const cinn::common::Target &target);

-void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,  // NOLINT
-                          ir::Tensor out,
-                          int last_dimension_num,
-                          const cinn::common::Target &target);
-
-void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,  // NOLINT
-                               ir::Tensor reduce_tmp_out,
-                               ir::Tensor tmp_out,
-                               ir::Tensor out,
-                               const cinn::common::Target &target);
-
-void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,  // NOLINT
-                                       ir::Tensor tmp_out,
-                                       ir::Tensor out,
-                                       const cinn::common::Target &target);
-
-void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
-                                      ir::Tensor reshape,
-                                      ir::Tensor internal,
-                                      ir::Tensor out,
-                                      const cinn::common::Target &target);
-
-void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,  // NOLINT
-                                 ir::Tensor reshape,
-                                 ir::Tensor internal,
-                                 ir::Tensor tmp_out,
-                                 ir::Tensor out,
-                                 const cinn::common::Target &target);
+void IRGpuScheduleReduce(ir::IRSchedule &ir_sch,  // NOLINT
+                         ir::Tensor out,
+                         int last_dimension_num,
+                         const cinn::common::Target &target);
+
+void IRGpuScheduleBlockReduce(ir::IRSchedule &ir_sch,  // NOLINT
+                              ir::Tensor reduce_tmp_out,
+                              ir::Tensor tmp_out,
+                              ir::Tensor out,
+                              const cinn::common::Target &target);
+
+void IRGpuScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,  // NOLINT
+                                      ir::Tensor tmp_out,
+                                      ir::Tensor out,
+                                      const cinn::common::Target &target);
+
+void IRGpuScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
+                                     ir::Tensor reshape,
+                                     ir::Tensor internal,
+                                     ir::Tensor out,
+                                     const cinn::common::Target &target);
+
+void IRGpuTwoStepReduceSchedule(ir::IRSchedule &ir_sch,  // NOLINT
+                                ir::Tensor reshape,
+                                ir::Tensor internal,
+                                ir::Tensor tmp_out,
+                                ir::Tensor out,
+                                const cinn::common::Target &target);

void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis = -1); // NOLINT

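After this header change, downstream code includes paddle/cinn/hlir/pe/ir_schedule_pe.h and calls the IRGpu* names. A hedged usage sketch — the wrapper below is illustrative only; real callers receive ir_sch, the tensors, and target from the op-strategy framework rather than constructing them:

#include "paddle/cinn/hlir/pe/ir_schedule_pe.h"

namespace cinn {
namespace hlir {

// Illustrative wrapper: shows the renamed entry point and its argument
// order as declared above. It does not build a valid schedule by itself.
void ScheduleReduceForGpu(ir::IRSchedule &ir_sch,  // NOLINT
                          ir::Tensor out,
                          int last_dimension_num,
                          const cinn::common::Target &target) {
  pe::IRGpuScheduleReduce(ir_sch, out, last_dimension_num, target);
}

}  // namespace hlir
}  // namespace cinn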
@@ -239,7 +239,7 @@ std::vector<ir::Expr> CalculateIndexCommonFactor(
// FLAGS_cinn_bucket_compile=1. However, some unit tests (e.g.
// test_resnet_cinn, test_instance_norm_op) are still running with the
// deprecated OpScheduler, and the ir::Expr will break this guarantee after
-// IRCudaScheduleBlockReduce function. So we have to relax the restriction
+// IRGpuScheduleBlockReduce function. So we have to relax the restriction
// here.
if (indexes[i].size() != indexes[0].size()) {
LOG(WARNING)