Skip to content

Commit

Permalink
Revert "[Perf]Polish UniformRandom And Split it into ScheduleBlock (PaddlePaddle#1357)"
Browse files Browse the repository at this point in the history

This reverts commit 658615e.
  • Loading branch information
Aurelius84 committed May 11, 2023
1 parent 78e0379 commit 413e581
Show file tree
Hide file tree
Showing 10 changed files with 8 additions and 287 deletions.
2 changes: 1 addition & 1 deletion cinn/backends/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ endif()

if (WITH_CUDA)
nv_test(test_codegen_cuda_generate SRCS codegen_cuda_generate_test.cc DEPS cinncore)
nv_test(test_codegen_debug SRCS codegen_debug_test.cc DEPS cinncore cinn_runtime)
nv_test(test_codegen_debug SRCS codegen_debug_test.cc DEPS cinncore)

if (WITH_TESTING)
cc_library(generated1_cuda SRCS generated1.cu DEPS cinncore)
Expand Down
2 changes: 1 addition & 1 deletion cinn/backends/codegen_debug_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ TEST(CodeGenDebug, RunCudaSourceCode) {
common::Context::Global().ResetNameId();

std::string source_code = R"ROC(
extern "C" {
#include "cinn_cuda_runtime_source.cuh"
extern "C" {
#ifdef __CUDACC_RTC__
typedef int int32_t;
Expand Down
12 changes: 1 addition & 11 deletions cinn/backends/extern_func_protos.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@ ExternFunctionProtoRegistry::ExternFunctionProtoRegistry() {
static const std::vector<std::string> extern_funcs_float_bool_unary = {"isnan", "isfinite", "isinf"};
static const std::vector<std::string> extern_funcs_int_binary = {
"left_shift", "right_shift", "bitwise_or", "bitwise_and", "bitwise_xor", "bitwise_not"};
static const std::vector<std::string> extern_funcs_int_int_unary = {"bitwise_not"};
static const std::vector<std::string> extern_funcs_int_float_call = {"cinn_nvgpu_uniform_random_fp32"};
static const std::vector<std::string> extern_funcs_int_double_call = {"cinn_nvgpu_uniform_random_fp64"};
static const std::vector<std::string> extern_funcs_int_int_unary = {"bitwise_not"};
for (int i = 0; i < extern_funcs_fp32_unary.size(); ++i) {
auto* proto = new FunctionProto(extern_funcs_fp32_unary[i], {Float(32)}, Float(32));
Register(proto->name, proto);
Expand All @@ -46,14 +44,6 @@ ExternFunctionProtoRegistry::ExternFunctionProtoRegistry() {
auto* proto = new FunctionProto(extern_funcs_int_int_unary[i], {Int(32)}, Int(32));
Register(proto->name, proto);
}
for (int i = 0; i < extern_funcs_int_float_call.size(); ++i) {
auto* proto = new FunctionProto(extern_funcs_int_float_call[i], {Int(32)}, Float(32));
Register(proto->name, proto);
}
for (int i = 0; i < extern_funcs_int_double_call.size(); ++i) {
auto* proto = new FunctionProto(extern_funcs_int_double_call[i], {Int(32)}, Float(64));
Register(proto->name, proto);
}

auto* n = detail::CreateTanhVProto();
Register(n->name, n);
Expand Down
3 changes: 0 additions & 3 deletions cinn/hlir/op/contrib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,3 @@ cc_test(test_repeat SRCS repeat_test.cc DEPS cinncore)
cc_test(test_one_hot SRCS one_hot_test.cc DEPS cinncore)
cc_test(test_lookup_table SRCS lookup_table_test.cc DEPS cinncore)
cc_test(test_reciprocal SRCS reciprocal_test.cc DEPS cinncore)
if (WITH_CUDA)
cc_test(test_uniform_random_gpu SRCS uniform_random_test.cc DEPS cinncore)
endif()
52 changes: 4 additions & 48 deletions cinn/hlir/op/contrib/uniform_random.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cinn/hlir/op/contrib/uniform_random.h"

#include <gflags/gflags.h>

Expand Down Expand Up @@ -46,43 +45,13 @@
#include "cinn/poly/stage.h"
#include "glog/logging.h"

DECLARE_bool(cinn_ir_schedule);

namespace cinn {
namespace hlir {
namespace op {

using common::CINNValue;
using common::CINNValuePack;

// Only for min = 0. and max = 1.
ir::Tensor UniformRandom(const std::vector<int> &shape,
int seed,
const std::string &dtype,
const Target &target,
const std::string &tensor_name) {
std::string extern_func = "cinn_nvgpu_uniform_random_";
if (target != common::DefaultNVGPUTarget()) {
LOG(FATAL) << "Not Implemented UniformRandom for target: " << target;
}

if (dtype == "float32") {
extern_func += "fp32";
} else if (dtype == "float64") {
extern_func += "fp64";
} else {
LOG(FATAL) << "Not Implemented UniformRandom for dtype: " << dtype;
}

std::vector<Expr> new_shape;
for (auto item : shape) {
new_shape.push_back(Expr(item));
}

return lang::Compute(
new_shape, [=]() { return lang::CallExtern(extern_func, {Expr(seed)}); }, tensor_name);
}

std::shared_ptr<framework::OpStrategy> StrategyForUniformRandom(const framework::NodeAttr &attrs,
const std::vector<ir::Tensor> &inputs,
const std::vector<Type> &out_type,
Expand All @@ -91,22 +60,9 @@ std::shared_ptr<framework::OpStrategy> StrategyForUniformRandom(const framework:
framework::CINNCompute uniform_random_compute([=](lang::Args args, lang::RetValue *ret) {
CHECK(attrs.attr_store.count("shape"));
ir::Tensor shape_tensor;
CHECK(output_shapes.size() == 1UL);
CHECK(attrs.attr_store.count("seed"));
int seed = absl::get<int>(attrs.attr_store.at("seed"));
std::string dtype = "float32";
if (attrs.attr_store.find("dtype") != attrs.attr_store.end()) {
dtype = absl::get<std::string>(attrs.attr_store.at("dtype"));
}
CINNValuePack arg_pack = args[0];
std::string tensor_name = UniqName("uniform_random_out");
if (FLAGS_cinn_ir_schedule) {
CHECK_EQ(arg_pack.size(), 1U);
CHECK(arg_pack[0].is_string());
tensor_name = arg_pack[0].operator std::string();
}
auto out = UniformRandom(output_shapes[0], seed, dtype, target, tensor_name);
auto stages = CreateStages({out});
std::string tensor_name = "uniform_random_out";
auto out = pe::Identity(shape_tensor, tensor_name).front();
auto stages = CreateStages({out});
std::vector<CINNValue> res{CINNValue(out), CINNValue(stages)};
*ret = CINNValuePack{res};
});
Expand Down Expand Up @@ -148,7 +104,7 @@ CINN_REGISTER_HELPER(uniform_random_ops) {
.set_attr<cinn::hlir::framework::StrategyFunction>("CINNStrategy", cinn::hlir::op::StrategyForUniformRandom)
.set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForUniformRandom))
.set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForUniformRandom))
.set_attr<cinn::hlir::framework::OpPatternKind>("OpPattern", cinn::hlir::framework::OpPatternKind::kElementWise)
.set_attr<cinn::hlir::framework::OpPatternKind>("OpPattern", cinn::hlir::framework::OpPatternKind::kNonFusible)
.set_support_level(4);

return true;
Expand Down
37 changes: 0 additions & 37 deletions cinn/hlir/op/contrib/uniform_random.h

This file was deleted.

167 changes: 0 additions & 167 deletions cinn/hlir/op/contrib/uniform_random_test.cc

This file was deleted.

1 change: 1 addition & 0 deletions cinn/hlir/op/external_api_registry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ CINN_REGISTER_HELPER(op_external_api) {
CINN_OP_REGISTER_EXTERNAL_API(cublas_gemm, default_nvgpu).set_api_name("cinn_call_cublas");
CINN_OP_REGISTER_EXTERNAL_API(cublas_matmul, default_nvgpu).set_api_name("cinn_call_cublas");
CINN_OP_REGISTER_EXTERNAL_API(gaussian_random, default_nvgpu).set_api_name("cinn_call_gaussian_random");
CINN_OP_REGISTER_EXTERNAL_API(uniform_random, default_nvgpu).set_api_name("cinn_call_uniform_random");
CINN_OP_REGISTER_EXTERNAL_API(randint, default_nvgpu).set_api_name("cinn_call_randint");
CINN_OP_REGISTER_EXTERNAL_API(cholesky, default_nvgpu).set_api_name("cinn_call_cholesky_nvgpu");
CINN_OP_REGISTER_EXTERNAL_API(cholesky, default_host).set_api_name("cinn_call_cholesky_host");
Expand Down
18 changes: 0 additions & 18 deletions cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
/**
* \file This file contains all the intrinsics available to be used in CUDA code generated by CodeGen.
*/

#include <cuda_runtime.h>
#include <curand_kernel.h>

extern "C" {
// *************************************************************** //
// float32 unary and binary operator
Expand Down Expand Up @@ -346,20 +342,6 @@ __device__ inline bool cinn_any(const bool left, const bool right) { return left
shfl_res = __shfl_down_sync(mask, tmp_val, offset, 32); \
tmp_val = op((threadIdx.x & 0x1f) + offset < lane ? shfl_res : init, tmp_val);

__device__ inline float cinn_nvgpu_uniform_random_fp32(int seed){
curandStatePhilox4_32_10_t state;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 1, &state);
return curand_uniform(&state);
}

__device__ inline double cinn_nvgpu_uniform_random_fp64(int seed){
curandStatePhilox4_32_10_t state;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 1, &state);
return curand_uniform_double(&state);
}

#define CINN_WARP_SHUFFLE_INTERNAL_IMPL(REDUCE_TYPE, INITIAL_VALUE, DTYPE) \
__device__ inline DTYPE cinn_warp_shuffle_##REDUCE_TYPE##_internal(const DTYPE value) { \
DTYPE tmp_val = value, shfl_res; \
Expand Down
1 change: 0 additions & 1 deletion cinn/utils/data_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ std::vector<T> GetTensorData(const hlir::framework::Tensor& tensor, const common
}

template std::vector<float> GetTensorData<float>(const hlir::framework::Tensor& tensor, const common::Target& target);
template std::vector<double> GetTensorData<double>(const hlir::framework::Tensor& tensor, const common::Target& target);
template std::vector<int> GetTensorData<int>(const hlir::framework::Tensor& tensor, const common::Target& target);

} // namespace cinn

0 comments on commit 413e581

Please sign in to comment.