From 803980241a42ce5ca10f2099abb206e352c1b093 Mon Sep 17 00:00:00 2001 From: Josh Herrera Date: Wed, 25 May 2022 10:26:28 -0700 Subject: [PATCH 1/7] Working 8 bit vlut for relay take operator --- python/tvm/topi/hexagon/injective.py | 6 ++ src/target/llvm/codegen_hexagon.cc | 148 +++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) diff --git a/python/tvm/topi/hexagon/injective.py b/python/tvm/topi/hexagon/injective.py index 34a9fb9a05e5..88b7d94a33b6 100644 --- a/python/tvm/topi/hexagon/injective.py +++ b/python/tvm/topi/hexagon/injective.py @@ -37,6 +37,12 @@ def schedule_injective(outs): outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs s = tvm.te.create_schedule([x.op for x in outs]) tvm.te.schedule.AutoInlineInjective(s) + + # Fuse axes and vectorize inner 128 elements + for x in outs: + fused = s[x].fuse(*x.op.axis) + outer, inner = s[x].split(fused, factor=128) + s[x].vectorize(inner) return s diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index a195c9f05453..92b3bdc45198 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -74,8 +74,19 @@ class CodeGenHexagon final : public CodeGenCPU { bool system_lib, bool dynamic_lookup, bool target_c_runtime) override; void InitTarget(llvm::TargetMachine* tm) final; + using CodeGenCPU::VisitStmt_; + llvm::Value* VisitExpr_(const BufferLoadNode* op) override; + llvm::Module* GetModulePtr() const { return module_.get(); } + uint64_t GetTypeSizeInBits(llvm::Type* type) const { +#if TVM_LLVM_VERSION >= 100 + return data_layout_->getTypeSizeInBits(type).getFixedValue(); +#else + return data_layout_->getTypeSizeInBits(type); +#endif + } + protected: void CreatePrintf(const std::string& format, llvm::ArrayRef format_args) final; @@ -86,6 +97,10 @@ class CodeGenHexagon final : public CodeGenCPU { llvm::GlobalVariable* InitContextPtr(llvm::Type* type, std::string name); llvm::Value* GetContextPtr(llvm::GlobalVariable* gv); + + llvm::Value* VectorLookupLoad(Buffer buffer, DataType buffer_type, Array index); + llvm::Value* Intrinsic(llvm::Intrinsic::ID, llvm::ArrayRef args); + }; void CodeGenHexagon::Init(const std::string& module_name, llvm::TargetMachine* tm, @@ -281,6 +296,139 @@ CodeGenLLVM::TypedPointer CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::V return TypedPointer(); } +llvm::Value* CodeGenHexagon::Intrinsic(llvm::Intrinsic::ID IntID, + llvm::ArrayRef args) { + llvm::Function* intf = llvm::Intrinsic::getDeclaration(module_.get(), IntID); +#if TVM_LLVM_VERSION >= 90 + auto intf_callee = llvm::FunctionCallee(intf); +#else + auto intf_callee = intf; +#endif + std::vector conv_args; + llvm::FunctionType* intf_type = intf->getFunctionType(); + ICHECK(args.size() == intf_type->getNumParams()); + + for (int i = 0, e = args.size(); i != e; ++i) { + llvm::Value* arg = args[i]; + auto* need_type = llvm::dyn_cast(intf_type->getParamType(i)); + auto* have_type = llvm::dyn_cast(arg->getType()); + if (need_type != nullptr && have_type != nullptr && need_type != have_type) { + int need_width = GetTypeSizeInBits(need_type); + int have_width = GetTypeSizeInBits(have_type); + if (need_width == have_width) { + if (need_width == native_vector_bits_ || need_width == 2 * native_vector_bits_) { + arg = builder_->CreateBitCast(arg, need_type); + } + } // TODO: add handling of v128i1 <-> v1024i1 + } + conv_args.push_back(arg); + } + return builder_->CreateCall(intf_callee, conv_args); +} + +llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) { + if (!op->buffer.same_as(op->buffer->data)) { + // Check if we can generate a vector lookup. + if (!op->indices[0].as()) { + if (auto* vlut = VectorLookupLoad(op->buffer, op->dtype, op->indices)) { + return vlut; + } + } + return CodeGenLLVM::VisitExpr_(op); + } +} + +llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_type, + Array indices) { + PrimExpr index = indices[0]; + if (!index.dtype().is_vector()) { + return nullptr; + } + + if (buffer_type.bits() != 8) return nullptr; + + int table_elem_count = arith::Analyzer().Simplify(buffer->shape[0]).as()->value; + if (table_elem_count <= 0 || table_elem_count > 256) return nullptr; + + auto int32 = DataType::Int(32); + auto native_vector_bytes = native_vector_bits_ / 8; + + // Indexes + llvm::Value* trunc = MakeValue(Cast(index.dtype().with_bits(8), index)); + llvm::Value* index_pad = CreateVecPad(trunc, native_vector_bytes); + + // Values + std::vector vloads; + DataType table_type = buffer_type.with_lanes(table_elem_count); + + auto table_all = MakeValue(BufferLoad(buffer, + {Ramp(IntImm(int32, 0), IntImm(int32, 1), table_elem_count),})); + + // The number of value vectors should be a power of 2. + int table_vec_count = llvm::PowerOf2Ceil(GetVectorBytes(table_type) / native_vector_bytes); + int table_vec_length = native_vector_bytes / buffer_type.bytes(); + for (int i = 0; i != table_vec_count; ++i) { + // CreateVecSlice will generate undefs for elements outside the source vector. + vloads.push_back(CreateVecSlice(table_all, i * table_vec_length, table_vec_length)); + } + +#define VLO(x) Intrinsic(llvm::Intrinsic::hexagon_V6_lo_128B, {x}) +#define VHI(x) Intrinsic(llvm::Intrinsic::hexagon_V6_hi_128B, {x}) +#define VXOR(x, y) Intrinsic(llvm::Intrinsic::hexagon_V6_vxor_128B, {x, y}) +#define VSHUFF(x) Intrinsic(llvm::Intrinsic::hexagon_V6_vshuffb_128B, {x}) +#define VSPLATB(x) Intrinsic(llvm::Intrinsic::hexagon_V6_lvsplatb_128B, {x}) +#define VLUT32(x, y, z) Intrinsic(llvm::Intrinsic::hexagon_V6_vlutvvbi_128B, {x, y, z}) +#define VLUT32_OR(v, x, y, z) \ + Intrinsic(llvm::Intrinsic::hexagon_V6_vlutvvb_oracci_128B, {v, x, y, z}) + + // Shuffle table bytes: + // 127, 63, 126, 62,........68, 4, 67, 3, 66, 2, 65, 1, 64, 0 + std::vector table; + for (int i = 0; i != table_vec_count; ++i) + table.push_back(VSHUFF(vloads[i])); + + // Get each 32 byte sub-table's output + std::vector results; + int table_iters = table_elem_count / 32; + for (int i = 0; i < table_iters; ++i) + results.push_back(VLUT32(index_pad, table[i/4], ConstInt32(i%8))); + + // Combine outputs + llvm::Value* result = results[0]; + for (int i = 1; i < table_iters; ++i) + result = VXOR(result, results[i]); + + llvm::Type* res_type = result->getType(); + llvm::Type* ret_type = DTypeToLLVMType(buffer_type); + if (res_type == ret_type) { + return result; + } + + int res_bits = GetTypeSizeInBits(res_type); + int ret_bits = GetTypeSizeInBits(ret_type); + ICHECK_GE(res_bits, ret_bits); + if (ret_bits < res_bits) { +#if TVM_LLVM_VERSION >= 110 + llvm::Type* res_byte_type = llvm::VectorType::get(t_int8_, res_bits / 8, /*Scalable*/ false); +#else + llvm::Type* res_byte_type = llvm::VectorType::get(t_int8_, res_bits / 8); +#endif + result = CreateVecSlice(builder_->CreateBitCast(result, res_byte_type), 0, ret_bits / 8); + } + if (result->getType() != ret_type) { + return builder_->CreateBitCast(result, ret_type); + } + return result; + +#undef VLUT32_OR +#undef VLUT32 +#undef VSPLATB +#undef VSHUFF +#undef VXOR +#undef VHI +#undef VLO +} + namespace { DMLC_ATTRIBUTE_UNUSED std::ostream& operator<<(std::ostream& os, const llvm::Module& m) { std::string ms; From 7181d7618b7d35bb3f98e498d17758806b70780e Mon Sep 17 00:00:00 2001 From: Josh Herrera Date: Thu, 23 Jun 2022 08:18:10 -0700 Subject: [PATCH 2/7] Formatting --- python/tvm/topi/hexagon/injective.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/topi/hexagon/injective.py b/python/tvm/topi/hexagon/injective.py index 88b7d94a33b6..9ced0ac7d399 100644 --- a/python/tvm/topi/hexagon/injective.py +++ b/python/tvm/topi/hexagon/injective.py @@ -41,7 +41,7 @@ def schedule_injective(outs): # Fuse axes and vectorize inner 128 elements for x in outs: fused = s[x].fuse(*x.op.axis) - outer, inner = s[x].split(fused, factor=128) + _, inner = s[x].split(fused, factor=128) s[x].vectorize(inner) return s From 035bbf31acf37d5bdad0f72f2e2f0e6d3ddcd709 Mon Sep 17 00:00:00 2001 From: Josh Herrera Date: Thu, 23 Jun 2022 14:03:44 -0700 Subject: [PATCH 3/7] More formatting --- src/target/llvm/codegen_hexagon.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index 92b3bdc45198..153f064b1751 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -100,7 +100,6 @@ class CodeGenHexagon final : public CodeGenCPU { llvm::Value* VectorLookupLoad(Buffer buffer, DataType buffer_type, Array index); llvm::Value* Intrinsic(llvm::Intrinsic::ID, llvm::ArrayRef args); - }; void CodeGenHexagon::Init(const std::string& module_name, llvm::TargetMachine* tm, @@ -319,7 +318,7 @@ llvm::Value* CodeGenHexagon::Intrinsic(llvm::Intrinsic::ID IntID, if (need_width == native_vector_bits_ || need_width == 2 * native_vector_bits_) { arg = builder_->CreateBitCast(arg, need_type); } - } // TODO: add handling of v128i1 <-> v1024i1 + } // TODO(joshherr-quic): add handling of v128i1 <-> v1024i1 } conv_args.push_back(arg); } @@ -361,8 +360,8 @@ llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_typ std::vector vloads; DataType table_type = buffer_type.with_lanes(table_elem_count); - auto table_all = MakeValue(BufferLoad(buffer, - {Ramp(IntImm(int32, 0), IntImm(int32, 1), table_elem_count),})); + auto table_all = MakeValue(BufferLoad(buffer, + {Ramp(IntImm(int32, 0), IntImm(int32, 1), table_elem_count), })); // The number of value vectors should be a power of 2. int table_vec_count = llvm::PowerOf2Ceil(GetVectorBytes(table_type) / native_vector_bytes); From 550c7e62ba8cc01093126944104fff2e74c218dd Mon Sep 17 00:00:00 2001 From: Josh Herrera Date: Fri, 24 Jun 2022 13:34:49 -0700 Subject: [PATCH 4/7] clang-format on codegen_hexagon.cc --- src/target/llvm/codegen_hexagon.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index 153f064b1751..f998c649deb5 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -360,8 +360,10 @@ llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_typ std::vector vloads; DataType table_type = buffer_type.with_lanes(table_elem_count); - auto table_all = MakeValue(BufferLoad(buffer, - {Ramp(IntImm(int32, 0), IntImm(int32, 1), table_elem_count), })); + auto table_all = + MakeValue(BufferLoad(buffer, { + Ramp(IntImm(int32, 0), IntImm(int32, 1), table_elem_count), + })); // The number of value vectors should be a power of 2. int table_vec_count = llvm::PowerOf2Ceil(GetVectorBytes(table_type) / native_vector_bytes); @@ -383,19 +385,17 @@ llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_typ // Shuffle table bytes: // 127, 63, 126, 62,........68, 4, 67, 3, 66, 2, 65, 1, 64, 0 std::vector table; - for (int i = 0; i != table_vec_count; ++i) - table.push_back(VSHUFF(vloads[i])); + for (int i = 0; i != table_vec_count; ++i) table.push_back(VSHUFF(vloads[i])); // Get each 32 byte sub-table's output std::vector results; int table_iters = table_elem_count / 32; for (int i = 0; i < table_iters; ++i) - results.push_back(VLUT32(index_pad, table[i/4], ConstInt32(i%8))); + results.push_back(VLUT32(index_pad, table[i / 4], ConstInt32(i % 8))); // Combine outputs llvm::Value* result = results[0]; - for (int i = 1; i < table_iters; ++i) - result = VXOR(result, results[i]); + for (int i = 1; i < table_iters; ++i) result = VXOR(result, results[i]); llvm::Type* res_type = result->getType(); llvm::Type* ret_type = DTypeToLLVMType(buffer_type); From 6e10636255c45e2edd25d0bca5258aa5e08e2563 Mon Sep 17 00:00:00 2001 From: Josh Herrera Date: Tue, 28 Jun 2022 13:32:21 -0700 Subject: [PATCH 5/7] Update for llvm api --- src/target/llvm/codegen_hexagon.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index f998c649deb5..a7ce2c79df87 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -80,7 +80,9 @@ class CodeGenHexagon final : public CodeGenCPU { llvm::Module* GetModulePtr() const { return module_.get(); } uint64_t GetTypeSizeInBits(llvm::Type* type) const { -#if TVM_LLVM_VERSION >= 100 +#if TVM_LLVM_VERSION >= 110 + return data_layout_->getTypeSizeInBits(type).getFixedSize(); +#elif TVM_LLVM_VERSION >= 100 return data_layout_->getTypeSizeInBits(type).getFixedValue(); #else return data_layout_->getTypeSizeInBits(type); From 88a13f796a7ff675d9e3bc110625d736e6fb28b3 Mon Sep 17 00:00:00 2001 From: Josh Herrera Date: Thu, 30 Jun 2022 08:31:59 -0700 Subject: [PATCH 6/7] Add return to VisitExpr(BufferLoadNode) function --- src/target/llvm/codegen_hexagon.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index a7ce2c79df87..2550fe1469f4 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -335,8 +335,8 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) { return vlut; } } - return CodeGenLLVM::VisitExpr_(op); } + return CodeGenLLVM::VisitExpr_(op); } llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_type, From 2d3a6b5f81322b2e62267fd7efa4603aedc7ade8 Mon Sep 17 00:00:00 2001 From: Josh Herrera Date: Thu, 30 Jun 2022 09:22:07 -0700 Subject: [PATCH 7/7] different llvm api --- src/target/llvm/codegen_hexagon.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index 2550fe1469f4..7b0081869a27 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -80,10 +80,8 @@ class CodeGenHexagon final : public CodeGenCPU { llvm::Module* GetModulePtr() const { return module_.get(); } uint64_t GetTypeSizeInBits(llvm::Type* type) const { -#if TVM_LLVM_VERSION >= 110 +#if TVM_LLVM_VERSION >= 100 return data_layout_->getTypeSizeInBits(type).getFixedSize(); -#elif TVM_LLVM_VERSION >= 100 - return data_layout_->getTypeSizeInBits(type).getFixedValue(); #else return data_layout_->getTypeSizeInBits(type); #endif