[opt] Remove legacy vectorization pass (taichi-dev#4096)
re-xyr committed Jan 24, 2022
1 parent b1f8b08 commit 46e2387
Showing 31 changed files with 46 additions and 677 deletions.
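
This commit removes the legacy per-loop vectorize attribute end to end: the Python-level binding, the DecoratorRecorder field, the vectorize members of the frontend and offloaded for-statements, and the corresponding IRBuilder and irpass::compile_to_executable parameters (the separate bit_vectorize attribute is kept). As a minimal sketch of how a call site migrates, with argument names taken from the old and new taichi/ir/ir_builder.h signatures in this diff:

// Before this commit, the third argument is the legacy vectorize width:
//   auto *loop = builder.create_range_for(zero, n_stmt, /*vectorize=*/1,
//                                         /*bit_vectorize=*/0,
//                                         /*num_cpu_threads=*/4);
// After, the remaining arguments simply shift left by one:
auto *loop = builder.create_range_for(zero, n_stmt, /*bit_vectorize=*/0,
                                      /*num_cpu_threads=*/4);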
4 changes: 2 additions & 2 deletions cpp_examples/aot_save.cpp

@@ -32,7 +32,7 @@ void aot_save() {
   IRBuilder builder;
   auto *zero = builder.get_int32(0);
   auto *n_stmt = builder.get_int32(n);
-  auto *loop = builder.create_range_for(zero, n_stmt, 1, 0, 4);
+  auto *loop = builder.create_range_for(zero, n_stmt, 0, 4);
   {
     auto _ = builder.get_loop_guard(loop);
     auto *index = builder.get_loop_index(loop);
@@ -55,7 +55,7 @@ void aot_save() {
   */
   IRBuilder builder;
   auto *sum = builder.create_local_var(PrimitiveType::i32);
-  auto *loop = builder.create_struct_for(pointer, 1, 0, 4);
+  auto *loop = builder.create_struct_for(pointer, 0, 4);
   {
     auto _ = builder.get_loop_guard(loop);
     auto *index = builder.get_loop_index(loop);
6 changes: 3 additions & 3 deletions cpp_examples/autograd.cpp

@@ -91,7 +91,7 @@ void autograd() {
     auto *zero = builder.get_int32(0);
     auto *one = builder.get_int32(1);
     auto *n_stmt = builder.get_int32(n);
-    auto *loop = builder.create_range_for(zero, n_stmt, 1, 0, 4);
+    auto *loop = builder.create_range_for(zero, n_stmt, 0, 4);
     {
       auto _ = builder.get_loop_guard(loop);
       auto *i = builder.get_loop_index(loop);
@@ -114,7 +114,7 @@ void autograd() {
 
   auto get_kernel_cal = [&](bool grad) -> Kernel * {
     IRBuilder builder;
-    auto *loop = builder.create_struct_for(a, 1, 0, 4);
+    auto *loop = builder.create_struct_for(a, 0, 4);
     {
       auto _ = builder.get_loop_guard(loop);
       auto *i = builder.get_loop_index(loop);
@@ -133,7 +133,7 @@ void autograd() {
 
   {
     IRBuilder builder;
-    auto *loop = builder.create_struct_for(a, 1, 0, 4);
+    auto *loop = builder.create_struct_for(a, 0, 4);
     {
       auto _ = builder.get_loop_guard(loop);
       auto *i = builder.get_loop_index(loop);
6 changes: 3 additions & 3 deletions cpp_examples/run_snode.cpp

@@ -64,7 +64,7 @@ void run_snode() {
   IRBuilder builder;
   auto *zero = builder.get_int32(0);
   auto *n_stmt = builder.get_int32(n);
-  auto *loop = builder.create_range_for(zero, n_stmt, 1, 0, 4);
+  auto *loop = builder.create_range_for(zero, n_stmt, 0, 4);
   {
     auto _ = builder.get_loop_guard(loop);
     auto *index = builder.get_loop_index(loop);
@@ -87,7 +87,7 @@ void run_snode() {
   */
   IRBuilder builder;
   auto *sum = builder.create_local_var(PrimitiveType::i32);
-  auto *loop = builder.create_struct_for(pointer, 1, 0, 4);
+  auto *loop = builder.create_struct_for(pointer, 0, 4);
   {
     auto _ = builder.get_loop_guard(loop);
     auto *index = builder.get_loop_index(loop);
@@ -110,7 +110,7 @@ void run_snode() {
   # ext = place.to_numpy()
   */
   IRBuilder builder;
-  auto *loop = builder.create_struct_for(pointer, 1, 0, 4);
+  auto *loop = builder.create_struct_for(pointer, 0, 4);
   {
     auto _ = builder.get_loop_guard(loop);
     auto *index = builder.get_loop_index(loop);
1 change: 0 additions & 1 deletion python/taichi/lang/__init__.py

@@ -742,7 +742,6 @@ def loop_unique(val, covers=None):
 
 parallelize = _ti_core.parallelize
 serialize = lambda: parallelize(1)
-vectorize = _ti_core.vectorize
 bit_vectorize = _ti_core.bit_vectorize
 block_dim = _ti_core.block_dim
 global_thread_idx = _ti_core.insert_thread_idx_expr
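
Note on the Python surface: with this binding deleted, user code that still calls ti.vectorize(...) should presumably fail with an AttributeError (an inference from the removed attribute, not something tested in this diff), while ti.bit_vectorize, ti.parallelize, and ti.serialize above are untouched.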
3 changes: 1 addition & 2 deletions taichi/backends/cc/codegen_cc.cpp

@@ -51,8 +51,7 @@ class CCTransformer : public IRVisitor {
     auto ir = kernel_->ir.get();
     auto config = kernel_->program->config;
     config.demote_dense_struct_fors = true;
-    irpass::compile_to_executable(ir, config, kernel_,
-                                  /*vectorize=*/false, kernel_->grad,
+    irpass::compile_to_executable(ir, config, kernel_, kernel_->grad,
                                   /*ad_use_stack=*/true, config.print_ir,
                                   /*lower_global_access*/ true);
   }
3 changes: 1 addition & 2 deletions taichi/backends/opengl/codegen_opengl.cpp

@@ -1223,8 +1223,7 @@ void OpenglCodeGen::lower() {
   auto ir = kernel_->ir.get();
   auto &config = kernel_->program->config;
   config.demote_dense_struct_fors = true;
-  irpass::compile_to_executable(ir, config, kernel_,
-                                /*vectorize=*/false, kernel_->grad,
+  irpass::compile_to_executable(ir, config, kernel_, kernel_->grad,
                                 /*ad_use_stack=*/false, config.print_ir,
                                 /*lower_global_access=*/true,
                                 /*make_thread_local=*/config.make_thread_local);
3 changes: 1 addition & 2 deletions taichi/codegen/spirv/spirv_codegen.cpp

@@ -1804,8 +1804,7 @@ void KernelCodegen::run(TaichiKernelAttributes &kernel_attribs,
 void lower(Kernel *kernel) {
   auto &config = kernel->program->config;
   config.demote_dense_struct_fors = true;
-  irpass::compile_to_executable(kernel->ir.get(), config, kernel,
-                                /*vectorize=*/false, kernel->grad,
+  irpass::compile_to_executable(kernel->ir.get(), config, kernel, kernel->grad,
                                 /*ad_use_stack=*/false, config.print_ir,
                                 /*lower_global_access=*/true,
                                 /*make_thread_local=*/false);
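
Worth noting: all three non-LLVM backends touched here (cc, opengl, spirv) already hard-coded /*vectorize=*/false when calling irpass::compile_to_executable, so dropping the parameter removes dead configuration rather than changing codegen behavior for these backends.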
12 changes: 0 additions & 12 deletions taichi/ir/frontend_ir.cpp

@@ -38,14 +38,12 @@ IRNode *FrontendContext::root() {
 FrontendForStmt::FrontendForStmt(const ExprGroup &loop_var,
                                  const Expr &global_var)
     : global_var(global_var) {
-  vectorize = dec.vectorize;
   bit_vectorize = dec.bit_vectorize;
   num_cpu_threads = dec.num_cpu_threads;
   strictly_serialized = dec.strictly_serialized;
   block_dim = dec.block_dim;
   auto cfg = get_current_program().config;
   if (cfg.arch == Arch::cuda) {
-    vectorize = 1;
     num_cpu_threads = 1;
     TI_ASSERT(block_dim <= taichi_max_gpu_block_dim);
   } else {
@@ -55,8 +53,6 @@ FrontendForStmt::FrontendForStmt(const ExprGroup &loop_var,
   }
   mem_access_opt = dec.mem_access_opt;
   dec.reset();
-  if (vectorize == -1)
-    vectorize = 1;
 
   loop_var_id.resize(loop_var.size());
   for (int i = 0; i < (int)loop_var.size(); i++) {
@@ -69,13 +65,11 @@ FrontendForStmt::FrontendForStmt(const ExprGroup &loop_var,
                                  const mesh::MeshPtr &mesh,
                                  const mesh::MeshElementType &element_type)
     : mesh_for(true), mesh(mesh.ptr.get()), element_type(element_type) {
-  vectorize = dec.vectorize;
   bit_vectorize = dec.bit_vectorize;
   num_cpu_threads = dec.num_cpu_threads;
   block_dim = dec.block_dim;
   auto cfg = get_current_program().config;
   if (cfg.arch == Arch::cuda) {
-    vectorize = 1;
     num_cpu_threads = 1;
     TI_ASSERT(block_dim <= taichi_max_gpu_block_dim);
   } else {
@@ -85,8 +79,6 @@ FrontendForStmt::FrontendForStmt(const ExprGroup &loop_var,
   }
   mem_access_opt = dec.mem_access_opt;
   dec.reset();
-  if (vectorize == -1)
-    vectorize = 1;
 
   loop_var_id.resize(loop_var.size());
   for (int i = 0; i < (int)loop_var.size(); i++) {
@@ -105,23 +97,19 @@ FrontendForStmt::FrontendForStmt(const Expr &loop_var,
                                  const Expr &begin,
                                  const Expr &end)
     : begin(begin), end(end) {
-  vectorize = dec.vectorize;
   bit_vectorize = dec.bit_vectorize;
   num_cpu_threads = dec.num_cpu_threads;
   strictly_serialized = dec.strictly_serialized;
   block_dim = dec.block_dim;
   auto cfg = get_current_program().config;
   if (cfg.arch == Arch::cuda) {
-    vectorize = 1;
     num_cpu_threads = 1;
   } else {
     if (num_cpu_threads == 0)
       num_cpu_threads = std::thread::hardware_concurrency();
   }
   mem_access_opt = dec.mem_access_opt;
   dec.reset();
-  if (vectorize == -1)
-    vectorize = 1;
   loop_var_id.resize(1);
   loop_var_id[0] = loop_var.cast<IdExpression>()->id;
   loop_var.expr->ret_type = PrimitiveType::i32;
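
The pattern being trimmed in all three constructors above is the decorator-recorder handshake: a directive records loop attributes into the global dec, and the next FrontendForStmt consumes them and resets the recorder. A self-contained mock of the surviving flow (not Taichi code; the names mirror this diff, everything else is simplified for illustration):

#include <cassert>

// Mock of the DecoratorRecorder pattern: directives set fields, and the next
// frontend for-statement consumes them, then resets the recorder so the
// attributes do not leak into later loops.
struct DecoratorRecorder {
  int bit_vectorize;
  int num_cpu_threads;
  void reset() {
    bit_vectorize = -1;   // -1: no bit-vectorization requested
    num_cpu_threads = 0;  // 0: fall back to hardware concurrency
  }
};

DecoratorRecorder dec;

inline void BitVectorize(int v) {
  dec.bit_vectorize = v;  // mirrors the helper kept in taichi/ir/ir.h
}

struct FrontendForStmt {
  int bit_vectorize;
  int num_cpu_threads;
  FrontendForStmt() {
    bit_vectorize = dec.bit_vectorize;      // consume pending decorators
    num_cpu_threads = dec.num_cpu_threads;
    dec.reset();                            // one-shot semantics
  }
};

int main() {
  dec.reset();
  BitVectorize(32);
  FrontendForStmt loop1;  // sees bit_vectorize == 32
  FrontendForStmt loop2;  // recorder already reset: sees -1
  assert(loop1.bit_vectorize == 32);
  assert(loop2.bit_vectorize == -1);
  return 0;
}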
1 change: 0 additions & 1 deletion taichi/ir/frontend_ir.h

@@ -126,7 +126,6 @@ class FrontendForStmt : public Stmt {
   Expr global_var;
   std::unique_ptr<Block> body;
   std::vector<Identifier> loop_var_id;
-  int vectorize;
   int bit_vectorize;
   int num_cpu_threads;
   bool strictly_serialized;
1 change: 0 additions & 1 deletion taichi/ir/ir.cpp

@@ -24,7 +24,6 @@ std::string snode_access_flag_name(SNodeAccessFlag type) {
 }
 
 void DecoratorRecorder::reset() {
-  vectorize = -1;
   bit_vectorize = -1;
   num_cpu_threads = 0;
   uniform = false;
5 changes: 0 additions & 5 deletions taichi/ir/ir.h

@@ -73,7 +73,6 @@ class MemoryAccessOptions {
 
 class DecoratorRecorder {
  public:
-  int vectorize;
   int bit_vectorize;
   int num_cpu_threads;
   bool strictly_serialized;
@@ -708,10 +707,6 @@ struct LocalAddress {
 
 extern DecoratorRecorder dec;
 
-inline void Vectorize(int v) {
-  dec.vectorize = v;
-}
-
 inline void BitVectorize(int v) {
   dec.bit_vectorize = v;
 }
13 changes: 5 additions & 8 deletions taichi/ir/ir_builder.cpp

@@ -85,34 +85,31 @@ IRBuilder::IfGuard::~IfGuard() {
 
 RangeForStmt *IRBuilder::create_range_for(Stmt *begin,
                                           Stmt *end,
-                                          int vectorize,
                                           int bit_vectorize,
                                           int num_cpu_threads,
                                           int block_dim,
                                           bool strictly_serialized) {
   return insert(Stmt::make_typed<RangeForStmt>(
-      begin, end, std::make_unique<Block>(), vectorize, bit_vectorize,
-      num_cpu_threads, block_dim, strictly_serialized));
+      begin, end, std::make_unique<Block>(), bit_vectorize, num_cpu_threads,
+      block_dim, strictly_serialized));
 }
 
 StructForStmt *IRBuilder::create_struct_for(SNode *snode,
-                                            int vectorize,
                                             int bit_vectorize,
                                             int num_cpu_threads,
                                             int block_dim) {
   return insert(Stmt::make_typed<StructForStmt>(
-      snode, std::make_unique<Block>(), vectorize, bit_vectorize,
-      num_cpu_threads, block_dim));
+      snode, std::make_unique<Block>(), bit_vectorize, num_cpu_threads,
+      block_dim));
 }
 
 MeshForStmt *IRBuilder::create_mesh_for(mesh::Mesh *mesh,
                                         mesh::MeshElementType element_type,
-                                        int vectorize,
                                         int bit_vectorize,
                                         int num_cpu_threads,
                                         int block_dim) {
   return insert(Stmt::make_typed<MeshForStmt>(
-      mesh, element_type, std::make_unique<Block>(), vectorize, bit_vectorize,
+      mesh, element_type, std::make_unique<Block>(), bit_vectorize,
       num_cpu_threads, block_dim));
 }
 
3 changes: 0 additions & 3 deletions taichi/ir/ir_builder.h

@@ -103,19 +103,16 @@ class IRBuilder {
   // Control flows.
   RangeForStmt *create_range_for(Stmt *begin,
                                  Stmt *end,
-                                 int vectorize = -1,
                                  int bit_vectorize = -1,
                                  int num_cpu_threads = 0,
                                  int block_dim = 0,
                                  bool strictly_serialized = false);
   StructForStmt *create_struct_for(SNode *snode,
-                                   int vectorize = -1,
                                    int bit_vectorize = -1,
                                    int num_cpu_threads = 0,
                                    int block_dim = 0);
   MeshForStmt *create_mesh_for(mesh::Mesh *mesh,
                                mesh::MeshElementType element_type,
-                               int vectorize = -1,
                                int bit_vectorize = -1,
                                int num_cpu_threads = 0,
                                int block_dim = 0);
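
Since every remaining loop attribute is defaulted in the header, a call site that never cared about vectorization can rely on the defaults entirely. A sketch, modeled on cpp_examples/aot_save.cpp above and assuming the usual Taichi IRBuilder headers (the bound 16 is an arbitrary example value):

IRBuilder builder;
auto *zero = builder.get_int32(0);
auto *n_stmt = builder.get_int32(16);
// Defaults from the new signature: bit_vectorize = -1, num_cpu_threads = 0,
// block_dim = 0, strictly_serialized = false.
auto *loop = builder.create_range_for(zero, n_stmt);
{
  auto _ = builder.get_loop_guard(loop);
  auto *index = builder.get_loop_index(loop);
}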
21 changes: 7 additions & 14 deletions taichi/ir/statements.cpp

@@ -236,7 +236,6 @@ std::unique_ptr<ConstStmt> ConstStmt::copy() {
 RangeForStmt::RangeForStmt(Stmt *begin,
                            Stmt *end,
                            std::unique_ptr<Block> &&body,
-                           int vectorize,
                            int bit_vectorize,
                            int num_cpu_threads,
                            int block_dim,
@@ -245,7 +244,6 @@ RangeForStmt::RangeForStmt(Stmt *begin,
     : begin(begin),
       end(end),
       body(std::move(body)),
-      vectorize(vectorize),
       bit_vectorize(bit_vectorize),
       num_cpu_threads(num_cpu_threads),
       block_dim(block_dim),
@@ -258,21 +256,19 @@ RangeForStmt::RangeForStmt(Stmt *begin,
 
 std::unique_ptr<Stmt> RangeForStmt::clone() const {
   auto new_stmt = std::make_unique<RangeForStmt>(
-      begin, end, body->clone(), vectorize, bit_vectorize, num_cpu_threads,
-      block_dim, strictly_serialized);
+      begin, end, body->clone(), bit_vectorize, num_cpu_threads, block_dim,
+      strictly_serialized);
   new_stmt->reversed = reversed;
   return new_stmt;
 }
 
 StructForStmt::StructForStmt(SNode *snode,
                              std::unique_ptr<Block> &&body,
-                             int vectorize,
                              int bit_vectorize,
                              int num_cpu_threads,
                              int block_dim)
     : snode(snode),
       body(std::move(body)),
-      vectorize(vectorize),
       bit_vectorize(bit_vectorize),
       num_cpu_threads(num_cpu_threads),
       block_dim(block_dim) {
@@ -281,24 +277,21 @@ StructForStmt::StructForStmt(SNode *snode,
 }
 
 std::unique_ptr<Stmt> StructForStmt::clone() const {
-  auto new_stmt = std::make_unique<StructForStmt>(snode, body->clone(),
-                                                  vectorize, bit_vectorize,
-                                                  num_cpu_threads, block_dim);
+  auto new_stmt = std::make_unique<StructForStmt>(
+      snode, body->clone(), bit_vectorize, num_cpu_threads, block_dim);
   new_stmt->mem_access_opt = mem_access_opt;
   return new_stmt;
 }
 
 MeshForStmt::MeshForStmt(mesh::Mesh *mesh,
                          mesh::MeshElementType element_type,
                          std::unique_ptr<Block> &&body,
-                         int vectorize,
                          int bit_vectorize,
                          int num_cpu_threads,
                          int block_dim)
     : mesh(mesh),
       major_from_type(element_type),
       body(std::move(body)),
-      vectorize(vectorize),
       bit_vectorize(bit_vectorize),
       num_cpu_threads(num_cpu_threads),
       block_dim(block_dim) {
@@ -307,9 +300,9 @@ MeshForStmt::MeshForStmt(mesh::Mesh *mesh,
 }
 
 std::unique_ptr<Stmt> MeshForStmt::clone() const {
-  auto new_stmt = std::make_unique<MeshForStmt>(
-      mesh, major_from_type, body->clone(), vectorize, bit_vectorize,
-      num_cpu_threads, block_dim);
+  auto new_stmt =
+      std::make_unique<MeshForStmt>(mesh, major_from_type, body->clone(),
+                                    bit_vectorize, num_cpu_threads, block_dim);
   new_stmt->major_to_types = major_to_types;
   new_stmt->minor_relation_types = minor_relation_types;
   new_stmt->mem_access_opt = mem_access_opt;
(Diffs for the remaining 17 of the 31 changed files were not loaded on this page.)
