From a7d144531bb7553d09fff851e3335991e9260e13 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 23 Aug 2022 13:55:37 -0700 Subject: [PATCH 1/6] Change default alignment to 64 bits. --- include/tvm/runtime/device_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index c3d83bf2993f..1bb10fa17ae6 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -52,10 +52,10 @@ enum DeviceAttrKind : int { }; /*! \brief Number of bytes each allocation must align to */ -constexpr int kAllocAlignment = 128; +constexpr int kAllocAlignment = 64; /*! \brief Number of bytes each allocation must align to in temporary allocation */ -constexpr int kTempAllocaAlignment = 128; +constexpr int kTempAllocaAlignment = 64; /*! \brief Maximum size that can be allocated on stack */ constexpr int kMaxStackAlloca = 1024; From 40b866d8e0a0b88472e57aa35e1da128d4e11f33 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 23 Aug 2022 14:01:51 -0700 Subject: [PATCH 2/6] Run dlpack test a few times. --- tests/python/contrib/test_dlpack.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py index c71fc45d0346..4e65f79c518e 100644 --- a/tests/python/contrib/test_dlpack.py +++ b/tests/python/contrib/test_dlpack.py @@ -21,7 +21,7 @@ from tvm.contrib.dlpack import to_pytorch_func -def test(): +def verify_torch_dlpack(): a = np.random.randn(1337) tvm_a = tvm.nd.array(a) np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).numpy(), a) @@ -63,5 +63,11 @@ def test(): pass +def test_torch_dlpack(): + # Run dlpack interoperability test a few times to make sure it's stable. + for i in range(5): + verify_torch_dlpack() + + if __name__ == "__main__": - test() + test_torch_dlpack() From a99cb246af9196b049225e11a7318197e32f3229 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 23 Aug 2022 15:29:32 -0700 Subject: [PATCH 3/6] Update alignment in tests. --- python/tvm/tir/schedule/schedule.py | 12 +- python/tvm/tir/tensor_intrin/cuda.py | 54 ++++--- .../test_ethosu/test_tir_to_cs_translator.py | 68 ++++----- .../contrib/test_ethosu/test_vela_api.py | 18 +-- .../test_tir_analysis_calculate_workspace.py | 16 +- tests/python/unittest/test_tir_intrin.py | 8 +- .../unittest/test_tir_schedule_analysis.py | 6 +- .../unittest/test_tir_schedule_reduction.py | 16 +- .../test_tir_schedule_storage_align.py | 18 +-- .../unittest/test_tir_schedule_tensorize.py | 30 ++-- ..._tir_transform_convert_for_loops_serial.py | 8 +- ...est_tir_transform_inject_rolling_buffer.py | 12 +- tests/python/unittest/test_tir_usmp_algo.py | 18 +-- ...st_tir_usmp_analysis_extract_bufferinfo.py | 138 +++++++++--------- ...orm_convert_pool_allocations_to_offsets.py | 36 ++--- ..._tir_usmp_transform_create_io_allocates.py | 48 +++--- tests/python/unittest/test_tir_usmp_utils.py | 18 +-- .../unittest/test_tvmscript_complete.py | 18 +-- .../unittest/test_tvmscript_roundtrip.py | 36 ++--- .../unittest/test_tvmscript_syntax_sugar.py | 12 +- tests/python/unittest/test_tvmscript_type.py | 6 +- 21 files changed, 297 insertions(+), 299 deletions(-) diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index e18bee35a5e1..e3c2d651e632 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -2092,9 +2092,9 @@ def before_tensorize( @T.prim_func def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=128, offset_factor=1) + A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) @@ -2107,9 +2107,9 @@ def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def mma_intrin(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=128, offset_factor=1) + A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py index b4f5d1d331e5..64d7c24840ae 100644 --- a/python/tvm/tir/tensor_intrin/cuda.py +++ b/python/tvm/tir/tensor_intrin/cuda.py @@ -120,12 +120,12 @@ def ldmatrix_desc(warp_handle: T.handle, shared_handle: T.handle) -> None: shared_handle, shmem_shape, dtype, - align=128, + align=64, offset_factor=16, scope=shared_scope, ) warp = T.match_buffer( - warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp" + warp_handle, (WARP_SIZE, local_size), dtype, align=64, offset_factor=16, scope="warp" ) with T.block("root"): @@ -149,13 +149,13 @@ def ldmatrix_impl(warp_handle: T.handle, shared_handle: T.handle) -> None: shared_handle, shmem_shape, dtype, - align=128, + align=64, offset_factor=16, scope=shared_scope, strides=[s0, s1], ) warp = T.match_buffer( - warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp" + warp_handle, (WARP_SIZE, local_size), dtype, align=64, offset_factor=16, scope="warp" ) with T.block("root"): @@ -222,13 +222,13 @@ def maybe_swap(i, j): @T.prim_func def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + a, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp" ) B = T.match_buffer( - b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + b, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp" ) C = T.match_buffer( - c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp" + c, (WARP_SIZE, local_size_out), out_dtype, align=64, offset_factor=16, scope="warp" ) with T.block("root"): @@ -262,13 +262,13 @@ def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def mma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + a, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp" ) B = T.match_buffer( - b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + b, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp" ) C = T.match_buffer( - c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp" + c, (WARP_SIZE, local_size_out), out_dtype, align=64, offset_factor=16, scope="warp" ) with T.block("root"): @@ -510,11 +510,9 @@ def get_wmma_load_intrin( @T.prim_func def wmma_load_desc(a: T.handle, c: T.handle) -> None: - A = T.match_buffer( - a, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=shared_scope - ) + A = T.match_buffer(a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=shared_scope) C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=wmma_fragment_scope + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=wmma_fragment_scope ) with T.block("root"): T.reads(A[0:m_dim, 0:n_dim]) @@ -532,13 +530,13 @@ def wmma_load_impl(a: T.handle, c: T.handle) -> None: a, (m_dim, n_dim), dtype, - align=128, + align=64, offset_factor=16, scope=shared_scope, strides=[s1, s0], ) C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=wmma_fragment_scope + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=wmma_fragment_scope ) with T.block("root"): T.reads(A[0:m_dim, 0:n_dim]) @@ -569,7 +567,7 @@ def get_wmma_fill_intrin( @T.prim_func def wmma_fill_desc(c: T.handle) -> None: C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator" + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) with T.block("root"): T.reads() @@ -582,7 +580,7 @@ def wmma_fill_desc(c: T.handle) -> None: @T.prim_func def wmma_fill_impl(c: T.handle) -> None: C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator" + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) with T.block("root"): T.reads() @@ -610,9 +608,9 @@ def get_wmma_store_intrin( @T.prim_func def wmma_store_desc(a: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator" + a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) - C = T.match_buffer(c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=scope) + C = T.match_buffer(c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=scope) with T.block("root"): T.reads(A[0:m_dim, 0:n_dim]) T.writes(C[0:m_dim, 0:n_dim]) @@ -626,10 +624,10 @@ def wmma_store_impl(a: T.handle, c: T.handle) -> None: s1 = T.var("int32") s0 = T.var("int32") A = T.match_buffer( - a, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator" + a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=scope, strides=[s1, s0] + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=scope, strides=[s1, s0] ) with T.block("root"): T.reads(A[0:m_dim, 0:n_dim]) @@ -671,18 +669,18 @@ def maybe_swap(i, j): @T.prim_func def wmma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (m_dim, k_dim), in_dtype, align=128, offset_factor=16, scope="wmma.matrix_a" + a, (m_dim, k_dim), in_dtype, align=64, offset_factor=16, scope="wmma.matrix_a" ) B = T.match_buffer( b, maybe_swap(k_dim, n_dim), in_dtype, - align=128, + align=64, offset_factor=16, scope="wmma.matrix_b", ) C = T.match_buffer( - c, (m_dim, n_dim), out_dtype, align=128, offset_factor=16, scope="wmma.accumulator" + c, (m_dim, n_dim), out_dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) with T.block("root"): @@ -699,18 +697,18 @@ def wmma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def wmma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (m_dim, k_dim), in_dtype, align=128, offset_factor=16, scope="wmma.matrix_a" + a, (m_dim, k_dim), in_dtype, align=64, offset_factor=16, scope="wmma.matrix_a" ) B = T.match_buffer( b, maybe_swap(k_dim, n_dim), in_dtype, - align=128, + align=64, offset_factor=16, scope="wmma.matrix_b", ) C = T.match_buffer( - c, (m_dim, n_dim), out_dtype, align=128, offset_factor=16, scope="wmma.accumulator" + c, (m_dim, n_dim), out_dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) with T.block("root"): diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py index 28522138cafc..e1a0e143281b 100644 --- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py +++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py @@ -525,10 +525,10 @@ class SingleEthosuDepthwiseConv2D: def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, ethosu_depthwise_conv2d: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_1, [18], dtype="int8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_2, [30], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_3 = T.match_buffer(placeholder, [192], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_depthwise_conv2d_1 = T.match_buffer(ethosu_depthwise_conv2d, [126], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_1, [18], dtype="int8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_2, [30], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_3 = T.match_buffer(placeholder, [192], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_depthwise_conv2d_1 = T.match_buffer(ethosu_depthwise_conv2d, [126], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 8, 3, 8, 0, 8, placeholder_3[0], 0, 0, 0, T.float32(0.6), 11, "NHWC", 24, 3, 1, "int8", 6, 7, 3, 6, 0, 7, ethosu_depthwise_conv2d_1[0], 0, 0, 0, T.float32(0.26), 15, "NHWC", 21, 3, 1, 2, 3, 1, 1, 1, 1, placeholder_4[0], 18, 13, placeholder_5[0], 30, 0, 0, 0, 0, "CLIP", 15, 105, "TFL", "NONE", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -991,8 +991,8 @@ class SingleEthosuPooling: def main(placeholder: T.handle, placeholder_3: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [75], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [75], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_pooling", "int8", 5, 9, 3, 5, 0, 9, placeholder_4[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 5, 3, 5, 0, 5, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 15, 3, 1, "AVG", 2, 3, 2, 1, 1, 1, 1, 1, 1, 0, "CLIP", 10, 100, "TFL", "NONE", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1065,10 +1065,10 @@ def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) placeholder_2 = T.match_buffer( - placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1 + placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1 ) ethosu_write_2 = T.match_buffer( - ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1 + ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1 ) # body T.evaluate(T.call_extern( "ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "ADD", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) @@ -1084,8 +1084,8 @@ class SingleEthosuBinaryElementwiseSub: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "SUB", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1099,8 +1099,8 @@ class SingleEthosuBinaryElementwiseMul: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "MUL", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1115,8 +1115,8 @@ class SingleEthosuBinaryElementwiseMin: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "MIN", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1131,8 +1131,8 @@ class SingleEthosuBinaryElementwiseMax: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "MAX", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1147,8 +1147,8 @@ class SingleEthosuBinaryElementwiseShr: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "SHR", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="int32")) __tvm_meta__ = None @@ -1163,8 +1163,8 @@ class SingleEthosuBinaryElementwiseShl: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "SHL", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int32")) __tvm_meta__ = None @@ -1284,8 +1284,8 @@ class SingleEthosuBinaryElementwiseAddBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "ADD", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1299,8 +1299,8 @@ class SingleEthosuBinaryElementwiseSubBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "SUB", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1314,8 +1314,8 @@ class SingleEthosuBinaryElementwiseMulBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "MUL", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1330,8 +1330,8 @@ class SingleEthosuBinaryElementwiseMinBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "MIN", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1346,8 +1346,8 @@ class SingleEthosuBinaryElementwiseMaxBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "MAX", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1362,8 +1362,8 @@ class SingleEthosuBinaryElementwiseShrBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int32", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int32", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "SHR", 1, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="int32")) __tvm_meta__ = None @@ -1378,8 +1378,8 @@ class SingleEthosuBinaryElementwiseShlBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int32", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int32", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "SHL", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int32")) __tvm_meta__ = None diff --git a/tests/python/contrib/test_ethosu/test_vela_api.py b/tests/python/contrib/test_ethosu/test_vela_api.py index e2e4b2cb3a91..75ca22d08202 100644 --- a/tests/python/contrib/test_ethosu/test_vela_api.py +++ b/tests/python/contrib/test_ethosu/test_vela_api.py @@ -50,16 +50,16 @@ def main( # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) placeholder_3 = T.match_buffer( - placeholder, [192], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + placeholder, [192], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) placeholder_4 = T.match_buffer( - placeholder_1, [48], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + placeholder_1, [48], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) placeholder_5 = T.match_buffer( - placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1 + placeholder_2, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1 ) ethosu_conv2d_1 = T.match_buffer( - ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) # body T.evaluate( @@ -142,20 +142,20 @@ def main( # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) placeholder_3 = T.match_buffer( - placeholder, [192], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + placeholder, [192], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) placeholder_4 = T.match_buffer( - placeholder_1, [48], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + placeholder_1, [48], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) placeholder_5 = T.match_buffer( - placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1 + placeholder_2, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1 ) # Per-channel weight scales placeholder_7 = T.match_buffer( - placeholder_6, [16], dtype="float32", elem_offset=0, align=128, offset_factor=1 + placeholder_6, [16], dtype="float32", elem_offset=0, align=64, offset_factor=1 ) ethosu_conv2d_1 = T.match_buffer( - ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) # body T.evaluate( diff --git a/tests/python/unittest/test_tir_analysis_calculate_workspace.py b/tests/python/unittest/test_tir_analysis_calculate_workspace.py index 8d3163c111c8..1d78458b930d 100644 --- a/tests/python/unittest/test_tir_analysis_calculate_workspace.py +++ b/tests/python/unittest/test_tir_analysis_calculate_workspace.py @@ -26,10 +26,10 @@ def primfunc_global_allocates(placeholder_144: T.handle, placeholder_145: T.handle, placeholder_146: T.handle, T_cast_48: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_13", "tir.noalias": True}) - placeholder_147 = T.match_buffer(placeholder_144, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_148 = T.match_buffer(placeholder_145, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_149 = T.match_buffer(placeholder_146, [512], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_49 = T.match_buffer(T_cast_48, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_147 = T.match_buffer(placeholder_144, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_148 = T.match_buffer(placeholder_145, [4608], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_149 = T.match_buffer(placeholder_146, [512], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_49 = T.match_buffer(T_cast_48, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_22 = T.allocate([131072], "int16", "global") DepthwiseConv2d_9 = T.allocate([100352], "int32", "global") @@ -57,10 +57,10 @@ def primfunc_global_allocates(placeholder_144: T.handle, placeholder_145: T.hand def primfunc_local_allocates(placeholder_162: T.handle, placeholder_163: T.handle, placeholder_164: T.handle, T_cast_76: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_9", "tir.noalias": True}) - placeholder_165 = T.match_buffer(placeholder_162, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_166 = T.match_buffer(placeholder_163, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_167 = T.match_buffer(placeholder_164, [512], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_77 = T.match_buffer(T_cast_76, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_165 = T.match_buffer(placeholder_162, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_166 = T.match_buffer(placeholder_163, [4608], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_167 = T.match_buffer(placeholder_164, [512], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_77 = T.match_buffer(T_cast_76, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1) sid_21 = T.allocate_const([0,1,2,3,4,5,6,7], "int8", [8]) # body PaddedInput_25 = T.allocate([131072], "int16", "global") diff --git a/tests/python/unittest/test_tir_intrin.py b/tests/python/unittest/test_tir_intrin.py index b8061fc0732a..f887f8877a22 100644 --- a/tests/python/unittest/test_tir_intrin.py +++ b/tests/python/unittest/test_tir_intrin.py @@ -203,7 +203,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None: [n], strides=[stride], elem_offset=0, - align=128, + align=64, offset_factor=1, buffer_type="auto", ) @@ -212,7 +212,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None: [n], strides=[stride_1], elem_offset=0, - align=128, + align=64, offset_factor=1, buffer_type="auto", ) @@ -221,7 +221,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None: [n], strides=[stride_2], elem_offset=0, - align=128, + align=64, offset_factor=1, buffer_type="auto", ) @@ -230,7 +230,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None: [n], strides=[stride_3], elem_offset=0, - align=128, + align=64, offset_factor=1, buffer_type="auto", ) diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py index d3e6033e880c..5524abbaf094 100644 --- a/tests/python/unittest/test_tir_schedule_analysis.py +++ b/tests/python/unittest/test_tir_schedule_analysis.py @@ -218,9 +218,9 @@ def test_get_tensorize_loop_mapping_conv2d_nchwc_vnni(): def test_get_tensorize_loop_mapping_matmul_mma(): @T.prim_func def matmul_16x16x16xf16f16f16_desc( - A: T.Buffer((16, 16), "float16", align=128, offset_factor=1), - B: T.Buffer((16, 16), "float16", align=128, offset_factor=1), - C: T.Buffer((16, 16), "float16", align=128, offset_factor=1), + A: T.Buffer((16, 16), "float16", align=64, offset_factor=1), + B: T.Buffer((16, 16), "float16", align=64, offset_factor=1), + C: T.Buffer((16, 16), "float16", align=64, offset_factor=1), ) -> None: with T.block("root"): T.reads(C[0:16, 0:16], A[0:16, 0:16], B[0:16, 0:16]) diff --git a/tests/python/unittest/test_tir_schedule_reduction.py b/tests/python/unittest/test_tir_schedule_reduction.py index f3503460e50a..1600b27f5e78 100644 --- a/tests/python/unittest/test_tir_schedule_reduction.py +++ b/tests/python/unittest/test_tir_schedule_reduction.py @@ -78,8 +78,8 @@ def matmul_decompose0(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def matmul_decompose1(a: T.handle, b: T.handle) -> None: - A = T.match_buffer(a, [32, 4, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [32, 4], elem_offset=0, align=128, offset_factor=1) + A = T.match_buffer(a, [32, 4, 128], elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [32, 4], elem_offset=0, align=64, offset_factor=1) for i0 in T.serial(0, 32): with T.block("blockized_B_init"): @@ -100,9 +100,9 @@ def matmul_decompose1(a: T.handle, b: T.handle) -> None: @T.prim_func def matmul_decompose2(a: T.handle, b: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) for i0, i1 in T.grid(128, 128): with T.block("update_init"): @@ -130,9 +130,9 @@ def matmul_decompose_fail3(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def matmul_decompose4(a: T.handle, b: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) diff --git a/tests/python/unittest/test_tir_schedule_storage_align.py b/tests/python/unittest/test_tir_schedule_storage_align.py index 072640c8f3af..23cb5d3b5339 100644 --- a/tests/python/unittest/test_tir_schedule_storage_align.py +++ b/tests/python/unittest/test_tir_schedule_storage_align.py @@ -26,13 +26,13 @@ @T.prim_func def element_wise(a: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) T.writes([]) - B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) + B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): with T.block("B"): @@ -50,13 +50,13 @@ def element_wise(a: T.handle, c: T.handle) -> None: @T.prim_func def element_wise_storage_align(a: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) T.writes([]) - B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) + B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): with T.block("B"): @@ -75,13 +75,13 @@ def element_wise_storage_align(a: T.handle, c: T.handle) -> None: @T.prim_func def element_wise_invalid_annotation(a: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) T.writes([]) - B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) + B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): with T.block("B"): diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py index 929a6cfa19bc..828dad2fc036 100644 --- a/tests/python/unittest/test_tir_schedule_tensorize.py +++ b/tests/python/unittest/test_tir_schedule_tensorize.py @@ -36,9 +36,9 @@ @T.prim_func def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=128, offset_factor=1) + A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) @@ -51,9 +51,9 @@ def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def mma_intrin(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=128, offset_factor=1) + A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) @@ -173,9 +173,9 @@ def matmul( @T.prim_func def tensorized_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) for i_outer, j_outer in T.grid(8, 8): for i_inner_init, j_inner_init in T.grid(16, 16): @@ -375,9 +375,9 @@ def tensorized_batch_matmul_outer_product( @T.prim_func def annotated_mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=128, offset_factor=1) + A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) @@ -406,9 +406,9 @@ def annotated_matmul( @T.prim_func def annotated_tensorized_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) for i_outer, j_outer in T.grid(8, 8): for i_inner_init, j_inner_init in T.grid(16, 16): diff --git a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py index 38431705611b..1a3afdd4c1e2 100644 --- a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py +++ b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py @@ -26,10 +26,10 @@ def fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_30: T.handle, placeholder_31: T.handle, placeholder_32: T.handle, T_cast_8: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", "tir.noalias": True}) - placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_34 = T.match_buffer(placeholder_31, [3072], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_35 = T.match_buffer(placeholder_32, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_9 = T.match_buffer(T_cast_8, [12544], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_34 = T.match_buffer(placeholder_31, [3072], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_35 = T.match_buffer(placeholder_32, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_9 = T.match_buffer(T_cast_8, [12544], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_3 = T.allocate([150528], "int16", "global") for i0_i1_fused_3 in T.parallel(0, 28): diff --git a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py index 073a0ebd4e84..65a586b8ecfd 100644 --- a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py +++ b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py @@ -196,9 +196,9 @@ def main(A: T.handle, tensor: T.handle) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) - A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) - tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.realize(tensor_1[0:1, 0:8, 0:8, 0:16], "") for ax1_outer in T.serial(0, 2): @@ -228,9 +228,9 @@ def main(A: T.handle, tensor: T.handle) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) - A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) - tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.realize(tensor_1[0:1, 0:8, 0:8, 0:16], "") T.realize(tensor_2[0:1, 0:6, 0:12, 0:16], "") diff --git a/tests/python/unittest/test_tir_usmp_algo.py b/tests/python/unittest/test_tir_usmp_algo.py index 140f6d1b146e..f67148189d8c 100644 --- a/tests/python/unittest/test_tir_usmp_algo.py +++ b/tests/python/unittest/test_tir_usmp_algo.py @@ -299,9 +299,9 @@ class MobilenetStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -311,10 +311,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -333,8 +333,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): diff --git a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py index d4e62362495c..60360ecade70 100644 --- a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py +++ b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py @@ -111,9 +111,9 @@ class LinearStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -123,10 +123,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -145,8 +145,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): @@ -215,10 +215,10 @@ class ParallelSerialMixedForLoops: def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placeholder_68: T.handle, placeholder_69: T.handle, placeholder_70: T.handle, T_cast_22: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", "tir.noalias": True}) - placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_8 = T.allocate([215296], "int16", "global") for i0_i1_fused_8 in T.serial(0, 58): @@ -256,10 +256,10 @@ class AllSerialForLoops: def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placeholder_68: T.handle, placeholder_69: T.handle, placeholder_70: T.handle, T_cast_22: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", "tir.noalias": True}) - placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_8 = T.allocate([215296], "int16", "global") for i0_i1_fused_8 in T.serial(0, 58): @@ -338,8 +338,8 @@ class InceptionStructure: def tvmgen_default_fused_nn_max_pool2d(placeholder: T.handle, tensor: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d", "tir.noalias": True}) - placeholder_1 = T.match_buffer(placeholder, [602112], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - tensor_1 = T.match_buffer(tensor, [249], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_1 = T.match_buffer(placeholder, [602112], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + tensor_1 = T.match_buffer(tensor, [249], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused in T.serial(0, 28): for ax2 in T.serial(0, 28): @@ -352,9 +352,9 @@ def tvmgen_default_fused_nn_max_pool2d(placeholder: T.handle, tensor: T.handle) def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -364,8 +364,8 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_cast(placeholder_6: T.handle, T_cast: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast", "tir.noalias": True}) - placeholder_7 = T.match_buffer(placeholder_6, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_1 = T.match_buffer(T_cast, [249], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_7 = T.match_buffer(placeholder_6, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_1 = T.match_buffer(T_cast, [249], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_2 in T.serial(0, 28): for ax2_2, ax3_outer_1, ax3_inner_2 in T.grid(28, 12, 16): @@ -375,11 +375,11 @@ def tvmgen_default_fused_cast(placeholder_6: T.handle, T_cast: T.handle) -> None def tvmgen_default_fused_concatenate(placeholder_8: T.handle, placeholder_9: T.handle, placeholder_10: T.handle, placeholder_11: T.handle, T_concat: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_concatenate", "tir.noalias": True}) - placeholder_12 = T.match_buffer(placeholder_8, [50176], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_concat_1 = T.match_buffer(T_concat, [313], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_13 = T.match_buffer(placeholder_9, [100352], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_14 = T.match_buffer(placeholder_11, [25088], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_15 = T.match_buffer(placeholder_10, [25088], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_12 = T.match_buffer(placeholder_8, [50176], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_concat_1 = T.match_buffer(T_concat, [313], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_13 = T.match_buffer(placeholder_9, [100352], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_14 = T.match_buffer(placeholder_11, [25088], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_15 = T.match_buffer(placeholder_10, [25088], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_3 in T.serial(0, 28): for ax2_3, ax3 in T.grid(28, 256): @@ -389,10 +389,10 @@ def tvmgen_default_fused_concatenate(placeholder_8: T.handle, placeholder_9: T.h def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(placeholder_16: T.handle, placeholder_17: T.handle, placeholder_18: T.handle, T_cast_2: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", "tir.noalias": True}) - placeholder_19 = T.match_buffer(placeholder_16, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_20 = T.match_buffer(placeholder_17, [4096], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_21 = T.match_buffer(placeholder_18, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_3 = T.match_buffer(T_cast_2, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_19 = T.match_buffer(placeholder_16, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_20 = T.match_buffer(placeholder_17, [4096], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_21 = T.match_buffer(placeholder_18, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_3 = T.match_buffer(T_cast_2, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput = T.allocate([200704], "int16", "global") for i0_i1_fused in T.serial(0, 56): @@ -411,10 +411,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(placeholder_22: T.handle, placeholder_23: T.handle, placeholder_24: T.handle, T_cast_4: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", "tir.noalias": True}) - placeholder_25 = T.match_buffer(placeholder_22, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_26 = T.match_buffer(placeholder_23, [18432], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_27 = T.match_buffer(placeholder_24, [96], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_5 = T.match_buffer(T_cast_4, [153], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_25 = T.match_buffer(placeholder_22, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_26 = T.match_buffer(placeholder_23, [18432], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_27 = T.match_buffer(placeholder_24, [96], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_5 = T.match_buffer(T_cast_4, [153], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_1 = T.allocate([150528], "int16", "global") for i0_i1_fused_1 in T.serial(0, 28): @@ -432,8 +432,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): @@ -450,10 +450,10 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2(placeholder_30: T.handle, placeholder_31: T.handle, placeholder_32: T.handle, T_cast_8: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2", "tir.noalias": True}) - placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_34 = T.match_buffer(placeholder_31, [12288], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_35 = T.match_buffer(placeholder_32, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_9 = T.match_buffer(T_cast_8, [121], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_34 = T.match_buffer(placeholder_31, [12288], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_35 = T.match_buffer(placeholder_32, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_9 = T.match_buffer(T_cast_8, [121], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_2 = T.allocate([150528], "int16", "global") for i0_i1_fused_2 in T.serial(0, 28): @@ -472,8 +472,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2(placehol def tvmgen_default_fused_nn_max_pool2d_cast_1(placeholder_36: T.handle, T_cast_10: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast_1", "tir.noalias": True}) - placeholder_37 = T.match_buffer(placeholder_36, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_11 = T.match_buffer(T_cast_10, [249], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_37 = T.match_buffer(placeholder_36, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_11 = T.match_buffer(T_cast_10, [249], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_3 = T.allocate([150528], "uint8", "global") for ax0_ax1_fused_6 in T.serial(0, 28): @@ -490,10 +490,10 @@ def tvmgen_default_fused_nn_max_pool2d_cast_1(placeholder_36: T.handle, T_cast_1 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2(placeholder_38: T.handle, placeholder_39: T.handle, placeholder_40: T.handle, T_cast_12: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2", "tir.noalias": True}) - placeholder_41 = T.match_buffer(placeholder_38, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_42 = T.match_buffer(placeholder_39, [6144], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_43 = T.match_buffer(placeholder_40, [32], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_13 = T.match_buffer(T_cast_12, [89], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_41 = T.match_buffer(placeholder_38, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_42 = T.match_buffer(placeholder_39, [6144], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_43 = T.match_buffer(placeholder_40, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_13 = T.match_buffer(T_cast_12, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_3 = T.allocate([150528], "int16", "global") for i0_i1_fused_3 in T.serial(0, 28): @@ -511,10 +511,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_44: T.handle, placeholder_45: T.handle, placeholder_46: T.handle, T_cast_14: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", "tir.noalias": True}) - placeholder_47 = T.match_buffer(placeholder_44, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_48 = T.match_buffer(placeholder_45, [3072], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_49 = T.match_buffer(placeholder_46, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_15 = T.match_buffer(T_cast_14, [73], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_47 = T.match_buffer(placeholder_44, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_48 = T.match_buffer(placeholder_45, [3072], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_49 = T.match_buffer(placeholder_46, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_15 = T.match_buffer(T_cast_14, [73], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_4 = T.allocate([150528], "int16", "global") for i0_i1_fused_4 in T.serial(0, 28): @@ -532,10 +532,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(pla def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1(placeholder_50: T.handle, placeholder_51: T.handle, placeholder_52: T.handle, T_cast_16: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1", "tir.noalias": True}) - placeholder_53 = T.match_buffer(placeholder_50, [12544], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_54 = T.match_buffer(placeholder_51, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_55 = T.match_buffer(placeholder_52, [32], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_17 = T.match_buffer(T_cast_16, [89], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_53 = T.match_buffer(placeholder_50, [12544], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_54 = T.match_buffer(placeholder_51, [4608], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_55 = T.match_buffer(placeholder_52, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_17 = T.match_buffer(T_cast_16, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_5 = T.allocate([14400], "int16", "global") for i0_i1_fused_5 in T.serial(0, 30): @@ -553,10 +553,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_(placeholder_56: T.handle, placeholder_57: T.handle, placeholder_58: T.handle, T_cast_18: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_", "tir.noalias": True}) - placeholder_59 = T.match_buffer(placeholder_56, [75264], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_60 = T.match_buffer(placeholder_57, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_61 = T.match_buffer(placeholder_58, [128], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_19 = T.match_buffer(T_cast_18, [185], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_59 = T.match_buffer(placeholder_56, [75264], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_60 = T.match_buffer(placeholder_57, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_61 = T.match_buffer(placeholder_58, [128], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_19 = T.match_buffer(T_cast_18, [185], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_6 = T.allocate([86400], "int16", "global") for i0_i1_fused_6 in T.serial(0, 30): @@ -576,10 +576,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "T.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -598,10 +598,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placeholder_68: T.handle, placeholder_69: T.handle, placeholder_70: T.handle, T_cast_22: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", "tir.noalias": True}) - placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_8 = T.allocate([215296], "int16", "global") for i0_i1_fused_8 in T.serial(0, 58): diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index 0a3e39b52f46..e6d123118757 100644 --- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -74,12 +74,12 @@ class LinearStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_4, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_5, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(T_subtract_1, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_4, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_5, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(T_subtract_1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -89,14 +89,14 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_65, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_66, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_67, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(T_cast_21, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_65, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_66, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_67, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(T_cast_21, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -115,10 +115,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_29, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(T_cast_7, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_29, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(T_cast_7, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): diff --git a/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py b/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py index d72cb7f72ede..53a381c82b14 100644 --- a/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py +++ b/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py @@ -28,9 +28,9 @@ class SingleInputSingleOutput: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -40,8 +40,8 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def __tvm_main__(input: T.handle, output: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) - input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input_buffer_var.data, T.lookup_param("p0", dtype="handle"), output_buffer_var.data, dtype="int32")) # fmt: on @@ -54,9 +54,9 @@ class TwoInputSingleOutput: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -66,9 +66,9 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def __tvm_main__(input1: T.handle, input2: T.handle, output: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) - input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - input2_buffer_var = T.match_buffer(input2, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + input2_buffer_var = T.match_buffer(input2, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input1_buffer_var.data, input2_buffer_var.data, output_buffer_var.data, dtype="int32")) # fmt: on @@ -81,9 +81,9 @@ class TwoInputTwoOutput: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -93,10 +93,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def __tvm_main__(input1: T.handle, input2: T.handle, output1: T.handle, output2: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) - input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - input2_buffer_var = T.match_buffer(input2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) - output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + input2_buffer_var = T.match_buffer(input2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) + output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input1_buffer_var.data, T.lookup_param("p0", dtype="handle"), output1_buffer_var.data, dtype="int32")) T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input2_buffer_var.data, T.lookup_param("p1", dtype="handle"), output2_buffer_var.data, dtype="int32")) @@ -110,9 +110,9 @@ class SingleInputTwoOutput: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -122,9 +122,9 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def __tvm_main__(input: T.handle, output1: T.handle, output2: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) - input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) - output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) + output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input_buffer_var.data, T.lookup_param("p0", dtype="handle"), output1_buffer_var.data, dtype="int32")) T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input_buffer_var.data, T.lookup_param("p1", dtype="handle"), output2_buffer_var.data, dtype="int32")) diff --git a/tests/python/unittest/test_tir_usmp_utils.py b/tests/python/unittest/test_tir_usmp_utils.py index 6e53bcb5e597..155ff0962def 100644 --- a/tests/python/unittest/test_tir_usmp_utils.py +++ b/tests/python/unittest/test_tir_usmp_utils.py @@ -31,9 +31,9 @@ class LinearStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -43,10 +43,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -65,8 +65,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): diff --git a/tests/python/unittest/test_tvmscript_complete.py b/tests/python/unittest/test_tvmscript_complete.py index c4b4afb24f82..29ac5dc5da0d 100644 --- a/tests/python/unittest/test_tvmscript_complete.py +++ b/tests/python/unittest/test_tvmscript_complete.py @@ -201,12 +201,12 @@ def func_with_bufferslice_indices(data: T.handle, index: T.handle) -> None: @T.prim_func def expected_bufferslice_indices(data: T.handle, index: T.handle) -> None: - index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=128, offset_factor=1) - data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=128, offset_factor=1) + index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=64, offset_factor=1) + data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=64, offset_factor=1) with T.block("root"): T.reads([]) T.writes([]) - out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=128, offset_factor=1) + out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=64, offset_factor=1) for i0, i1 in T.grid(16, 16): with T.block(): vi, vj = T.axis.remap("SS", [i0, i1]) @@ -229,12 +229,12 @@ def func_with_recursive_bufferslice_indices(data: T.handle, index: T.handle) -> @T.prim_func def expected_recursive_bufferslice_indices(data: T.handle, index: T.handle) -> None: - index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=128, offset_factor=1) - data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=128, offset_factor=1) + index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=64, offset_factor=1) + data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=64, offset_factor=1) with T.block("root"): T.reads([]) T.writes([]) - out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=128, offset_factor=1) + out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=64, offset_factor=1) for i0, i1 in T.grid(16, 16): with T.block(): vi, vj = T.axis.remap("SS", [i0, i1]) @@ -303,12 +303,12 @@ def alloc_buffer_func(a: T.handle, b: T.handle) -> None: @T.prim_func def expect_alloc_buffer_func(a: T.handle, b: T.handle) -> None: - A = T.match_buffer(a, [2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1) + A = T.match_buffer(a, [2, 2], dtype="float32", elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [2, 2], dtype="float32", elem_offset=0, align=64, offset_factor=1) with T.block("root"): T.reads([]) T.writes([]) - C = T.alloc_buffer([2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1) + C = T.alloc_buffer([2, 2], dtype="float32", elem_offset=0, align=64, offset_factor=1) A[(0, 0)] = T.float32(2) C[(0, 0)] = A[(0, 0)] + B[(0, 0)] B[(0, 0)] = C[(0, 0)] diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index e5f5ae752aac..e98f5057d8c4 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -34,11 +34,11 @@ def mmult(A: T.handle, B: T.handle, C: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "mmult", "tir.noalias": True}) # buffer definition - C_global = T.buffer_decl([1024, 1024], elem_offset=0, align=128, offset_factor=1) - packedB = T.buffer_decl([32, 1024, 32], elem_offset=0, align=128, offset_factor=1) - A_1 = T.match_buffer(A, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - C_1 = T.match_buffer(C, [1024, 1024], elem_offset=0, align=128, offset_factor=1) + C_global = T.buffer_decl([1024, 1024], elem_offset=0, align=64, offset_factor=1) + packedB = T.buffer_decl([32, 1024, 32], elem_offset=0, align=64, offset_factor=1) + A_1 = T.match_buffer(A, [1024, 1024], elem_offset=0, align=64, offset_factor=1) + B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1) + C_1 = T.match_buffer(C, [1024, 1024], elem_offset=0, align=64, offset_factor=1) # body T.realize(packedB[0:32, 0:1024, 0:32], "") for x in T.parallel(0, 32): @@ -90,9 +90,9 @@ class Module: def mmult(A: T.handle, B: T.handle, C: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "mmult", "tir.noalias": True}) - A_1 = T.match_buffer(A, [1024 * 1024], elem_offset=0, align=128, offset_factor=1) - B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - C_1 = T.match_buffer(C, [1024 * 1024], elem_offset=0, align=128, offset_factor=1) + A_1 = T.match_buffer(A, [1024 * 1024], elem_offset=0, align=64, offset_factor=1) + B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1) + C_1 = T.match_buffer(C, [1024 * 1024], elem_offset=0, align=64, offset_factor=1) # body packedB = T.allocate([32768], "float32", "global") for x in T.parallel(0, 32): @@ -484,10 +484,10 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None: tz = T.env_thread("threadIdx.z") # buffer definition Apad_shared = T.buffer_decl( - [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) Apad_shared_wmma_matrix_a = T.buffer_decl( - [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) BA = T.buffer_decl( [16, 16], dtype="float16", scope="wmma.matrix_a", align=32, offset_factor=256 @@ -497,13 +497,13 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None: ) BC = T.buffer_decl([16, 16], scope="wmma.accumulator", align=32, offset_factor=256) Conv_wmma_accumulator = T.buffer_decl( - [16, 14, 14, 32, 16, 16], elem_offset=0, align=128, offset_factor=1 + [16, 14, 14, 32, 16, 16], elem_offset=0, align=64, offset_factor=1 ) W_shared = T.buffer_decl( - [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) W_shared_wmma_matrix_b = T.buffer_decl( - [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) buffer = T.buffer_decl( [16, 16], dtype="float16", scope="shared", align=32, offset_factor=256 @@ -520,13 +520,13 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None: buffer_4 = T.buffer_decl([16, 16], scope="wmma.accumulator", align=32, offset_factor=256) buffer_5 = T.buffer_decl([16, 16], align=32, offset_factor=256) A_1 = T.match_buffer( - A, [16, 14, 14, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + A, [16, 14, 14, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) W_1 = T.match_buffer( - W, [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + W, [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) Conv_1 = T.match_buffer( - Conv, [16, 14, 14, 32, 16, 16], elem_offset=0, align=128, offset_factor=1 + Conv, [16, 14, 14, 32, 16, 16], elem_offset=0, align=64, offset_factor=1 ) # body T.realize(Conv_1[0:16, 0:14, 0:14, 0:32, 0:16, 0:16], "") @@ -2958,8 +2958,8 @@ def primfunc_with_allocate_annotations(): def primfunc_with_allocate_annotations(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global", annotations={"attr1_key": "attr1_value"}) for ax0_ax1_fused_4 in T.serial(0, 56): diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py index 329a397724f3..d955ec0a8c80 100644 --- a/tests/python/unittest/test_tvmscript_syntax_sugar.py +++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py @@ -288,9 +288,9 @@ def shared_16x16_to_ldmatrix_32x8_layout(i, j): @T.prim_func def mma_sync_m16n16k16_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp") - B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp") - C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + A = T.match_buffer(a, (32, 8), "float16", align=64, offset_factor=16, scope="warp") + B = T.match_buffer(b, (32, 8), "float16", align=64, offset_factor=16, scope="warp") + C = T.match_buffer(c, (32, 8), "float16", align=64, offset_factor=16, scope="warp") with T.block("root"): T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8]) @@ -315,9 +315,9 @@ def mma_sync_m16n16k16_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def mma_sync_m16n16k16_desc_manual(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp") - B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp") - C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + A = T.match_buffer(a, (32, 8), "float16", align=64, offset_factor=16, scope="warp") + B = T.match_buffer(b, (32, 8), "float16", align=64, offset_factor=16, scope="warp") + C = T.match_buffer(c, (32, 8), "float16", align=64, offset_factor=16, scope="warp") with T.block("root"): T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8]) diff --git a/tests/python/unittest/test_tvmscript_type.py b/tests/python/unittest/test_tvmscript_type.py index 12954e31e5ec..8228363a95ac 100644 --- a/tests/python/unittest/test_tvmscript_type.py +++ b/tests/python/unittest/test_tvmscript_type.py @@ -25,13 +25,13 @@ @T.prim_func def element_wise_storage_align(a: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) T.writes([]) - B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) + B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): with T.block("B"): From 526544b4a0ad898fce1b8cc15a210ad862faf342 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 23 Aug 2022 16:32:24 -0700 Subject: [PATCH 4/6] Revert mma alignment change. --- python/tvm/tir/schedule/schedule.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index e3c2d651e632..e18bee35a5e1 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -2092,9 +2092,9 @@ def before_tensorize( @T.prim_func def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) + A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=128, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) @@ -2107,9 +2107,9 @@ def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def mma_intrin(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) + A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=128, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) From a4b00ebc5972089e549173e9593fa9ba8bfc7f1c Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 23 Aug 2022 16:43:57 -0700 Subject: [PATCH 5/6] Change default printing of buffer. --- src/printer/tir_text_printer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index 894a9cec1e2a..cdfc8fd318fd 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -251,7 +251,7 @@ Doc TIRTextPrinter::BufferNode2Doc(const BufferNode* buf, Doc doc) { if (GetRef(buf).scope() != "global") { doc << ", scope=" << Doc::StrLiteral(GetRef(buf).scope()); } - if (buf->data_alignment != 128) { + if (buf->data_alignment != runtime::kAllocAlignment) { doc << ", align=" << buf->data_alignment; } if (buf->offset_factor != 1) { From 2c6019f8312410f05a5cfbadbb0fb8aee04afb02 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Wed, 24 Aug 2022 16:20:01 -0700 Subject: [PATCH 6/6] Change crt runtime default allocation. --- src/runtime/crt/common/crt_runtime_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index 2151c23f8462..7df610b53c45 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -104,7 +104,7 @@ int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shap } nbytes *= (dtype.bits * dtype.lanes + 7) / 8; - int kAllocAlignment = 128; + int kAllocAlignment = 64; size_t align = (dtype.bits / 8) * dtype.lanes; if (align < kAllocAlignment) align = kAllocAlignment; return TVMDeviceAllocDataSpace(dev, nbytes, align, dtype, out_data);