Revert "[Perf]Polish UniformRandom And Split it into ScheduleBlock (PaddlePaddle#1357)"

This reverts commit 658615e.
Aurelius84 · May 11, 2023 · 413e581 · 413e581
1 parent 78e0379
commit 413e581
Show file tree

Hide file tree

Showing 10 changed files with 8 additions and 287 deletions.
diff --git a/cinn/backends/CMakeLists.txt b/cinn/backends/CMakeLists.txt
@@ -40,7 +40,7 @@ endif()
 
 if (WITH_CUDA)
   nv_test(test_codegen_cuda_generate SRCS codegen_cuda_generate_test.cc DEPS cinncore)
-  nv_test(test_codegen_debug SRCS codegen_debug_test.cc DEPS cinncore cinn_runtime)
+  nv_test(test_codegen_debug SRCS codegen_debug_test.cc DEPS cinncore)
 
   if (WITH_TESTING)
     cc_library(generated1_cuda SRCS generated1.cu DEPS cinncore)

diff --git a/cinn/backends/codegen_debug_test.cc b/cinn/backends/codegen_debug_test.cc
@@ -60,9 +60,9 @@ TEST(CodeGenDebug, RunCudaSourceCode) {
   common::Context::Global().ResetNameId();
 
   std::string source_code = R"ROC(
+extern "C" {
 
 #include "cinn_cuda_runtime_source.cuh"
-extern "C" {
 
 #ifdef __CUDACC_RTC__
 typedef int int32_t;

diff --git a/cinn/backends/extern_func_protos.cc b/cinn/backends/extern_func_protos.cc
@@ -27,9 +27,7 @@ ExternFunctionProtoRegistry::ExternFunctionProtoRegistry() {
   static const std::vector<std::string> extern_funcs_float_bool_unary = {"isnan", "isfinite", "isinf"};
   static const std::vector<std::string> extern_funcs_int_binary       = {
       "left_shift", "right_shift", "bitwise_or", "bitwise_and", "bitwise_xor", "bitwise_not"};
-  static const std::vector<std::string> extern_funcs_int_int_unary   = {"bitwise_not"};
-  static const std::vector<std::string> extern_funcs_int_float_call  = {"cinn_nvgpu_uniform_random_fp32"};
-  static const std::vector<std::string> extern_funcs_int_double_call = {"cinn_nvgpu_uniform_random_fp64"};
+  static const std::vector<std::string> extern_funcs_int_int_unary = {"bitwise_not"};
   for (int i = 0; i < extern_funcs_fp32_unary.size(); ++i) {
     auto* proto = new FunctionProto(extern_funcs_fp32_unary[i], {Float(32)}, Float(32));
     Register(proto->name, proto);
@@ -46,14 +44,6 @@ ExternFunctionProtoRegistry::ExternFunctionProtoRegistry() {
     auto* proto = new FunctionProto(extern_funcs_int_int_unary[i], {Int(32)}, Int(32));
     Register(proto->name, proto);
   }
-  for (int i = 0; i < extern_funcs_int_float_call.size(); ++i) {
-    auto* proto = new FunctionProto(extern_funcs_int_float_call[i], {Int(32)}, Float(32));
-    Register(proto->name, proto);
-  }
-  for (int i = 0; i < extern_funcs_int_double_call.size(); ++i) {
-    auto* proto = new FunctionProto(extern_funcs_int_double_call[i], {Int(32)}, Float(64));
-    Register(proto->name, proto);
-  }
 
   auto* n = detail::CreateTanhVProto();
   Register(n->name, n);

diff --git a/cinn/hlir/op/contrib/CMakeLists.txt b/cinn/hlir/op/contrib/CMakeLists.txt
@@ -31,6 +31,3 @@ cc_test(test_repeat SRCS repeat_test.cc DEPS cinncore)
 cc_test(test_one_hot SRCS one_hot_test.cc DEPS cinncore)
 cc_test(test_lookup_table SRCS lookup_table_test.cc DEPS cinncore)
 cc_test(test_reciprocal SRCS reciprocal_test.cc DEPS cinncore)
-if (WITH_CUDA)
-   cc_test(test_uniform_random_gpu SRCS uniform_random_test.cc DEPS cinncore)
-endif()
diff --git a/cinn/hlir/op/contrib/uniform_random.cc b/cinn/hlir/op/contrib/uniform_random.cc
@@ -11,7 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "cinn/hlir/op/contrib/uniform_random.h"
 
 #include <gflags/gflags.h>
 
@@ -46,43 +45,13 @@
 #include "cinn/poly/stage.h"
 #include "glog/logging.h"
 
-DECLARE_bool(cinn_ir_schedule);
-
 namespace cinn {
 namespace hlir {
 namespace op {
 
 using common::CINNValue;
 using common::CINNValuePack;
 
-// Only for min = 0. and max = 1.
-ir::Tensor UniformRandom(const std::vector<int> &shape,
-                         int seed,
-                         const std::string &dtype,
-                         const Target &target,
-                         const std::string &tensor_name) {
-  std::string extern_func = "cinn_nvgpu_uniform_random_";
-  if (target != common::DefaultNVGPUTarget()) {
-    LOG(FATAL) << "Not Implemented UniformRandom for target: " << target;
-  }
-
-  if (dtype == "float32") {
-    extern_func += "fp32";
-  } else if (dtype == "float64") {
-    extern_func += "fp64";
-  } else {
-    LOG(FATAL) << "Not Implemented UniformRandom for dtype: " << dtype;
-  }
-
-  std::vector<Expr> new_shape;
-  for (auto item : shape) {
-    new_shape.push_back(Expr(item));
-  }
-
-  return lang::Compute(
-      new_shape, [=]() { return lang::CallExtern(extern_func, {Expr(seed)}); }, tensor_name);
-}
-
 std::shared_ptr<framework::OpStrategy> StrategyForUniformRandom(const framework::NodeAttr &attrs,
                                                                 const std::vector<ir::Tensor> &inputs,
                                                                 const std::vector<Type> &out_type,
@@ -91,22 +60,9 @@ std::shared_ptr<framework::OpStrategy> StrategyForUniformRandom(const framework:
   framework::CINNCompute uniform_random_compute([=](lang::Args args, lang::RetValue *ret) {
     CHECK(attrs.attr_store.count("shape"));
     ir::Tensor shape_tensor;
-    CHECK(output_shapes.size() == 1UL);
-    CHECK(attrs.attr_store.count("seed"));
-    int seed          = absl::get<int>(attrs.attr_store.at("seed"));
-    std::string dtype = "float32";
-    if (attrs.attr_store.find("dtype") != attrs.attr_store.end()) {
-      dtype = absl::get<std::string>(attrs.attr_store.at("dtype"));
-    }
-    CINNValuePack arg_pack  = args[0];
-    std::string tensor_name = UniqName("uniform_random_out");
-    if (FLAGS_cinn_ir_schedule) {
-      CHECK_EQ(arg_pack.size(), 1U);
-      CHECK(arg_pack[0].is_string());
-      tensor_name = arg_pack[0].operator std::string();
-    }
-    auto out    = UniformRandom(output_shapes[0], seed, dtype, target, tensor_name);
-    auto stages = CreateStages({out});
+    std::string tensor_name = "uniform_random_out";
+    auto out                = pe::Identity(shape_tensor, tensor_name).front();
+    auto stages             = CreateStages({out});
     std::vector<CINNValue> res{CINNValue(out), CINNValue(stages)};
     *ret = CINNValuePack{res};
   });
@@ -148,7 +104,7 @@ CINN_REGISTER_HELPER(uniform_random_ops) {
       .set_attr<cinn::hlir::framework::StrategyFunction>("CINNStrategy", cinn::hlir::op::StrategyForUniformRandom)
       .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForUniformRandom))
       .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForUniformRandom))
-      .set_attr<cinn::hlir::framework::OpPatternKind>("OpPattern", cinn::hlir::framework::OpPatternKind::kElementWise)
+      .set_attr<cinn::hlir::framework::OpPatternKind>("OpPattern", cinn::hlir::framework::OpPatternKind::kNonFusible)
       .set_support_level(4);
 
   return true;

diff --git a/cinn/hlir/op/contrib/uniform_random.h b/cinn/hlir/op/contrib/uniform_random.h
diff --git a/cinn/hlir/op/contrib/uniform_random_test.cc b/cinn/hlir/op/contrib/uniform_random_test.cc
diff --git a/cinn/hlir/op/external_api_registry.cc b/cinn/hlir/op/external_api_registry.cc
@@ -55,6 +55,7 @@ CINN_REGISTER_HELPER(op_external_api) {
   CINN_OP_REGISTER_EXTERNAL_API(cublas_gemm, default_nvgpu).set_api_name("cinn_call_cublas");
   CINN_OP_REGISTER_EXTERNAL_API(cublas_matmul, default_nvgpu).set_api_name("cinn_call_cublas");
   CINN_OP_REGISTER_EXTERNAL_API(gaussian_random, default_nvgpu).set_api_name("cinn_call_gaussian_random");
+  CINN_OP_REGISTER_EXTERNAL_API(uniform_random, default_nvgpu).set_api_name("cinn_call_uniform_random");
   CINN_OP_REGISTER_EXTERNAL_API(randint, default_nvgpu).set_api_name("cinn_call_randint");
   CINN_OP_REGISTER_EXTERNAL_API(cholesky, default_nvgpu).set_api_name("cinn_call_cholesky_nvgpu");
   CINN_OP_REGISTER_EXTERNAL_API(cholesky, default_host).set_api_name("cinn_call_cholesky_host");

diff --git a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
@@ -1,10 +1,6 @@
 /**
  * \file This file contains all the intrinsics available to be used in CUDA code generated by CodeGen.
  */
-
-#include <cuda_runtime.h>
-#include <curand_kernel.h>
-
 extern "C" {
 // *************************************************************** //
 // float32 unary and binary operator
@@ -346,20 +342,6 @@ __device__ inline bool cinn_any(const bool left, const bool right) { return left
   shfl_res = __shfl_down_sync(mask, tmp_val, offset, 32); \
   tmp_val  = op((threadIdx.x & 0x1f) + offset < lane ? shfl_res : init, tmp_val);
 
-__device__ inline float cinn_nvgpu_uniform_random_fp32(int seed){
-  curandStatePhilox4_32_10_t state;
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  curand_init(seed, idx, 1, &state);
-  return curand_uniform(&state);
-}
-
-__device__ inline double cinn_nvgpu_uniform_random_fp64(int seed){
-  curandStatePhilox4_32_10_t state;
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  curand_init(seed, idx, 1, &state);
-  return curand_uniform_double(&state);
-}
-
 #define CINN_WARP_SHUFFLE_INTERNAL_IMPL(REDUCE_TYPE, INITIAL_VALUE, DTYPE)                \
   __device__ inline DTYPE cinn_warp_shuffle_##REDUCE_TYPE##_internal(const DTYPE value) { \
     DTYPE tmp_val     = value, shfl_res;                                                  \

diff --git a/cinn/utils/data_util.cc b/cinn/utils/data_util.cc
@@ -116,7 +116,6 @@ std::vector<T> GetTensorData(const hlir::framework::Tensor& tensor, const common
 }
 
 template std::vector<float> GetTensorData<float>(const hlir::framework::Tensor& tensor, const common::Target& target);
-template std::vector<double> GetTensorData<double>(const hlir::framework::Tensor& tensor, const common::Target& target);
 template std::vector<int> GetTensorData<int>(const hlir::framework::Tensor& tensor, const common::Target& target);
 
 }  // namespace cinn