From baee5d9a57f6154f5c68882f2fc0b7d01c149c6b Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Thu, 5 Jan 2023 14:58:13 -0800 Subject: [PATCH 1/4] [Hexagon] Add hexagon user DMA intrins for tensorization. --- python/tvm/tir/tensor_intrin/hexagon.py | 123 ++++++++++++++---- .../test_hexagon/test_vtcm_bandwidth.py | 24 +++- 2 files changed, 119 insertions(+), 28 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py index 5e5749055bb0..0ab22e5d2daf 100644 --- a/python/tvm/tir/tensor_intrin/hexagon.py +++ b/python/tvm/tir/tensor_intrin/hexagon.py @@ -20,12 +20,53 @@ from .. import TensorIntrin -def generate_dot_product_32x4_u8u8i32(mem_scope="global"): +def generate_dma_load_intrin( + size: int, + dtype: str, +): + """Generator of dma_load intrins""" + + @T.prim_func + def dma_load_desc(a: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global") + C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm") + with T.block("root"): + T.reads(A[0:size]) + T.writes(C[0:size]) + for i in T.serial(size): + with T.block("load"): + vii = T.axis.remap("S", [i]) + C[vii] = A[vii] + + @T.prim_func + def dma_load_impl(a: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global") + C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm") + with T.block("root"): + T.reads(A[0:size]) + T.writes(C[0:size]) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.dma_copy", + 0, + T.address_of(C[0], dtype="handle"), + T.address_of(A[0], dtype="handle"), + size, + 0, + dtype="int32", + ) + ) + T.evaluate(T.tvm_call_packed("device_api.hexagon.dma_wait", 0, 0, dtype="int32")) + + return dma_load_desc, dma_load_impl + + +def generate_dot_product_32x4_u8u8i32(mem_scopes={"reads": "global", "write": "global"}): @T.prim_func def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -37,9 +78,9 @@ def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None @T.prim_func def dot_product_32x4_u8u8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -62,12 +103,12 @@ def dot_product_32x4_u8u8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non return dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy -def generate_dot_product_32x4_u8i8i32(mem_scope="global"): +def 
generate_dot_product_32x4_u8i8i32(mem_scopes={"reads": "global", "write": "global"}): @T.prim_func def dot_product_32x4_u8i8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -79,9 +120,9 @@ def dot_product_32x4_u8i8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None @T.prim_func def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -104,12 +145,12 @@ def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non return dot_product_32x4_u8i8i32_desc, dot_product_32x4_u8i8i32_vrmpy -def generate_dot_product_32x2_i16i16i32(mem_scope="global"): +def generate_dot_product_32x2_i16i16i32(mem_scopes={"reads": "global", "write": "global"}): @T.prim_func def dot_product_32x2_i16i16i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:2], B[0:32, 0:2]) T.writes(C[0:32]) @@ -121,9 +162,9 @@ def dot_product_32x2_i16i16i32_desc(a: T.handle, b: T.handle, c: T.handle) -> No @T.prim_func def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:2], B[0:32, 0:2]) T.writes(C[0:32]) @@ -159,7 +200,43 @@ def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> N TensorIntrin.register(VDMPY_i16i16i32_INTRIN, *generate_dot_product_32x2_i16i16i32()) VRMPY_u8u8i32_VTCM_INTRIN = "dot_32x4_u8u8i32_vtcm_vrmpy" -TensorIntrin.register(VRMPY_u8u8i32_VTCM_INTRIN, 
*generate_dot_product_32x4_u8u8i32("global.vtcm"))
+TensorIntrin.register(
+    VRMPY_u8u8i32_VTCM_INTRIN,
+    *generate_dot_product_32x4_u8u8i32({"reads": "global.vtcm", "write": "global.vtcm"}),
+)
+
+VRMPY_u8u8i32_VTCM_READS_INTRIN = "dot_32x4_u8u8i32_vtcm_reads_vrmpy"
+TensorIntrin.register(
+    VRMPY_u8u8i32_VTCM_READS_INTRIN,
+    *generate_dot_product_32x4_u8u8i32({"reads": "global.vtcm", "write": "global"}),
+)
 
 VRMPY_u8i8i32_VTCM_INTRIN = "dot_32x4_u8i8i32_vtcm_vrmpy"
-TensorIntrin.register(VRMPY_u8i8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8i8i32("global.vtcm"))
+TensorIntrin.register(
+    VRMPY_u8i8i32_VTCM_INTRIN,
+    *generate_dot_product_32x4_u8i8i32({"reads": "global.vtcm", "write": "global.vtcm"}),
+)
+
+DMA_READ_1_u8 = "dma_read_1_u8"
+TensorIntrin.register(DMA_READ_1_u8, *generate_dma_load_intrin(1, "uint8"))
+
+DMA_READ_1_i8 = "dma_read_1_i8"
+TensorIntrin.register(DMA_READ_1_i8, *generate_dma_load_intrin(1, "int8"))
+
+DMA_READ_128_u8 = "dma_read_128_u8"
+TensorIntrin.register(DMA_READ_128_u8, *generate_dma_load_intrin(128, "uint8"))
+
+DMA_READ_128_i8 = "dma_read_128_i8"
+TensorIntrin.register(DMA_READ_128_i8, *generate_dma_load_intrin(128, "int8"))
+
+DMA_READ_1024_u8 = "dma_read_1024_u8"
+TensorIntrin.register(DMA_READ_1024_u8, *generate_dma_load_intrin(1024, "uint8"))
+
+DMA_READ_1024_i8 = "dma_read_1024_i8"
+TensorIntrin.register(DMA_READ_1024_i8, *generate_dma_load_intrin(1024, "int8"))
+
+DMA_READ_4096_u8 = "dma_read_4096_u8"
+TensorIntrin.register(DMA_READ_4096_u8, *generate_dma_load_intrin(4096, "uint8"))
+
+DMA_READ_4096_i8 = "dma_read_4096_i8"
+TensorIntrin.register(DMA_READ_4096_i8, *generate_dma_load_intrin(4096, "int8"))
diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
index 254eb00cb2ea..97a4bdeca0f2 100644
--- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
+++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
@@ -21,6 +21,7 @@
 
 import tvm
 from tvm.script import tir as T
+from tvm.tir.tensor_intrin.hexagon import DMA_READ_128_i8
 
 from .infrastructure import get_hexagon_target
 
@@ -30,6 +31,7 @@
     "Test bandwidth with buffer size {}MB... \n"
     "    -Base: {} GBps \n    -Vectorized: {} GBps\n"
     "    -Vectorized and Parallelized: {} GBps\n"
+    "    -Sync DMA: {} GBps\n"
     "    -Single DMA Copy: {} GBps\n"
 )
 
@@ -104,8 +106,8 @@ def evaluate(hexagon_session, sch, size):
     )
 
     # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise.
-    number = 1
-    repeat = 1
+    number = 10
+    repeat = 10
 
     timer = module.time_evaluator(
         "__tvm_main__", hexagon_session.device, number=number, repeat=repeat
@@ -123,7 +125,10 @@ class TestMatMulVec:
 
     # Removed most of these to speedup CI.
     size = tvm.testing.parameter(
-        # 10 * KB,
+        128,
+        256,
+        1024,
+        10 * KB,
         # 20 * KB,
         # 40 * KB,
         # 80 * KB,
         # 160 * KB,
         # 320 * KB,
         640 * KB,
         # MB,
-        # 2 * MB,
+        2 * MB,
         # 3 * MB,
         # 4 * MB,
         # 8 * MB,  # Only works on 8gen1 HDKs
@@ -169,6 +174,15 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto
         sch.parallel(vbo_a)
         parallel_gbps = evaluate(hexagon_session, sch, size)
 
+        # Run while tensorizing the load into VTCM with the synchronous DMA intrinsic.
+ sch = tvm.tir.Schedule(memcopy_operator(size)) + block = sch.get_block("A_global.vtcm") + loops = sch.get_loops(block) + _, inner = sch.split(loops[0], [None, 128]) + sch.tensorize(inner, DMA_READ_128_i8) + # print(sch.mod.script()) + sync_dma_gbps = evaluate(hexagon_session, sch, size) + # Run using a single dma copy to transfer the data. sch = tvm.tir.Schedule(single_dma_operator(size)) single_dma_gbps = evaluate(hexagon_session, sch, size) @@ -176,7 +190,7 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto mbs = round(size / MB, 2) print( TEST_OUTPUT_TEMPLATE.format( - mbs, base_gpbs, vectorize_gbps, parallel_gbps, single_dma_gbps + mbs, base_gpbs, vectorize_gbps, parallel_gbps, sync_dma_gbps, single_dma_gbps ) ) From fb97e624a2d4c5042832f456e4ee4b33a09876d8 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Fri, 6 Jan 2023 17:17:38 -0800 Subject: [PATCH 2/4] revert changes not needed --- python/tvm/tir/tensor_intrin/hexagon.py | 58 ++++++++++--------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py index 0ab22e5d2daf..d4c636018272 100644 --- a/python/tvm/tir/tensor_intrin/hexagon.py +++ b/python/tvm/tir/tensor_intrin/hexagon.py @@ -61,12 +61,12 @@ def dma_load_impl(a: T.handle, c: T.handle) -> None: return dma_load_desc, dma_load_impl -def generate_dot_product_32x4_u8u8i32(mem_scopes={"reads": "global", "write": "global"}): +def generate_dot_product_32x4_u8u8i32(mem_scope="global"): @T.prim_func def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -78,9 +78,9 @@ def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None @T.prim_func def dot_product_32x4_u8u8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -103,12 +103,12 @@ def dot_product_32x4_u8u8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non return dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy -def generate_dot_product_32x4_u8i8i32(mem_scopes={"reads": "global", "write": "global"}): +def generate_dot_product_32x4_u8i8i32(mem_scope="global"): @T.prim_func def dot_product_32x4_u8i8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, 
(32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -120,9 +120,9 @@ def dot_product_32x4_u8i8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None @T.prim_func def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -145,12 +145,12 @@ def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non return dot_product_32x4_u8i8i32_desc, dot_product_32x4_u8i8i32_vrmpy -def generate_dot_product_32x2_i16i16i32(mem_scopes={"reads": "global", "write": "global"}): +def generate_dot_product_32x2_i16i16i32(mem_scope="global"): @T.prim_func def dot_product_32x2_i16i16i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:2], B[0:32, 0:2]) T.writes(C[0:32]) @@ -162,9 +162,9 @@ def dot_product_32x2_i16i16i32_desc(a: T.handle, b: T.handle, c: T.handle) -> No @T.prim_func def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:2], B[0:32, 0:2]) T.writes(C[0:32]) @@ -200,22 +200,10 @@ def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> N TensorIntrin.register(VDMPY_i16i16i32_INTRIN, *generate_dot_product_32x2_i16i16i32()) VRMPY_u8u8i32_VTCM_INTRIN = "dot_32x4_u8u8i32_vtcm_vrmpy" -TensorIntrin.register( - VRMPY_u8u8i32_VTCM_INTRIN, - *generate_dot_product_32x4_u8u8i32({"reads": "global.vtcm", "write": "global.vtcm"}), -) - -VRMPY_u8u8i32_VTCM_READS_INTRIN = "dot_32x4_u8u8i32_vtcm_reads_vrmpy" -TensorIntrin.register( - VRMPY_u8u8i32_VTCM_READS_INTRIN, - *generate_dot_product_32x4_u8u8i32({"reads": "global.vtcm", "write": "global"}), -) +TensorIntrin.register(VRMPY_u8u8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8u8i32("global.vtcm")) 
VRMPY_u8i8i32_VTCM_INTRIN = "dot_32x4_u8i8i32_vtcm_vrmpy" -TensorIntrin.register( - VRMPY_u8i8i32_VTCM_INTRIN, - *generate_dot_product_32x4_u8i8i32({"reads": "global.vtcm", "write": "global.vtcm"}), -) +TensorIntrin.register(VRMPY_u8i8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8i8i32("global.vtcm")) DMA_READ_1_u8 = "dma_read_1_u8" TensorIntrin.register(DMA_READ_1_u8, *generate_dma_load_intrin(1, "uint8")) From a04de7c96c5ac19c1578a710a730f56bf1cd4783 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Fri, 13 Jan 2023 16:12:57 -0800 Subject: [PATCH 3/4] Add comments, change queue id to -1, and remove tests from CI --- python/tvm/tir/tensor_intrin/hexagon.py | 37 +++++++------------ .../test_hexagon/test_vtcm_bandwidth.py | 32 +++++++--------- 2 files changed, 26 insertions(+), 43 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py index d4c636018272..d5de862fc91d 100644 --- a/python/tvm/tir/tensor_intrin/hexagon.py +++ b/python/tvm/tir/tensor_intrin/hexagon.py @@ -27,7 +27,7 @@ def generate_dma_load_intrin( """Generator of dma_load intrins""" @T.prim_func - def dma_load_desc(a: T.handle, c: T.handle) -> None: + def sync_dma_load_desc(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global") C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm") with T.block("root"): @@ -39,7 +39,7 @@ def dma_load_desc(a: T.handle, c: T.handle) -> None: C[vii] = A[vii] @T.prim_func - def dma_load_impl(a: T.handle, c: T.handle) -> None: + def sync_dma_load_impl(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global") C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm") with T.block("root"): @@ -48,17 +48,24 @@ def dma_load_impl(a: T.handle, c: T.handle) -> None: T.evaluate( T.tvm_call_packed( "device_api.hexagon.dma_copy", - 0, + -1, #Use QueueId of -1 to not interfere with async copies. T.address_of(C[0], dtype="handle"), T.address_of(A[0], dtype="handle"), size, - 0, + 0, #Do not use experimental bypass mode. dtype="int32", ) ) - T.evaluate(T.tvm_call_packed("device_api.hexagon.dma_wait", 0, 0, dtype="int32")) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.dma_wait", + -1, + 0, #Wait for the sync queue (-1) to have 0 messages. 
+ dtype="int32" + ) + ) - return dma_load_desc, dma_load_impl + return sync_dma_load_desc, sync_dma_load_impl def generate_dot_product_32x4_u8u8i32(mem_scope="global"): @@ -205,26 +212,8 @@ def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> N VRMPY_u8i8i32_VTCM_INTRIN = "dot_32x4_u8i8i32_vtcm_vrmpy" TensorIntrin.register(VRMPY_u8i8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8i8i32("global.vtcm")) -DMA_READ_1_u8 = "dma_read_1_u8" -TensorIntrin.register(DMA_READ_1_u8, *generate_dma_load_intrin(1, "uint8")) - -DMA_READ_1_i8 = "dma_read_1_i8" -TensorIntrin.register(DMA_READ_1_i8, *generate_dma_load_intrin(1, "int8")) - DMA_READ_128_u8 = "dma_read_128_u8" TensorIntrin.register(DMA_READ_128_u8, *generate_dma_load_intrin(128, "uint8")) DMA_READ_128_i8 = "dma_read_128_i8" TensorIntrin.register(DMA_READ_128_i8, *generate_dma_load_intrin(128, "int8")) - -DMA_READ_1024_u8 = "dma_read_1024_u8" -TensorIntrin.register(DMA_READ_1024_u8, *generate_dma_load_intrin(1024, "uint8")) - -DMA_READ_1024_i8 = "dma_read_1024_i8" -TensorIntrin.register(DMA_READ_1024_i8, *generate_dma_load_intrin(1024, "int8")) - -DMA_READ_4096_u8 = "dma_read_4096_u8" -TensorIntrin.register(DMA_READ_4096_u8, *generate_dma_load_intrin(4096, "uint8")) - -DMA_READ_4096_i8 = "dma_read_4096_i8" -TensorIntrin.register(DMA_READ_4096_i8, *generate_dma_load_intrin(4096, "int8")) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 97a4bdeca0f2..91876bb1e374 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -20,6 +20,7 @@ import numpy as np import tvm +import pytest from tvm.script import tir as T from tvm.tir.tensor_intrin.hexagon import DMA_READ_128_i8 @@ -105,13 +106,12 @@ def evaluate(hexagon_session, sch, size): a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" ) - # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. - number = 10 - repeat = 10 + if tvm.testing.utils.IS_IN_CI: + # Run with reduced number and repeat for CI + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1) + else: + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=10, repeat=10) - timer = module.time_evaluator( - "__tvm_main__", hexagon_session.device, number=number, repeat=repeat - ) runtime = timer(a_hexagon, a_vtcm_hexagon) gbps = round((size / 2**30) / runtime.mean, 4) @@ -126,20 +126,10 @@ class TestMatMulVec: # Removed most of these to speedup CI. size = tvm.testing.parameter( 128, - 256, - 1024, + KB, 10 * KB, - # 20 * KB, - # 40 * KB, - # 80 * KB, - # 160 * KB, - # 320 * KB, - 640 * KB, - # MB, - 2 * MB, - # 3 * MB, - # 4 * MB, - # 8 * MB, # Only works on 8gen1 HDKs + 100 * KB, + MB, ) outer_split = tvm.testing.parameter(4) @@ -149,6 +139,10 @@ class TestMatMulVec: @tvm.testing.requires_hexagon def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split): """Test bandwidth.""" + + if tvm.testing.utils.IS_IN_CI and (size > 128): + pytest.skip("Skipping test since it takes too long in CI.") + # Run the base memcopy operator. 
sch = tvm.tir.Schedule(memcopy_operator(size)) base_gpbs = evaluate(hexagon_session, sch, size) From 51641c68d15c274d650faf52195349dd5b7ee1dc Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Fri, 13 Jan 2023 16:13:22 -0800 Subject: [PATCH 4/4] lint changes --- python/tvm/tir/tensor_intrin/hexagon.py | 10 +++++----- .../python/contrib/test_hexagon/test_vtcm_bandwidth.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py index d5de862fc91d..7a348f3f1a45 100644 --- a/python/tvm/tir/tensor_intrin/hexagon.py +++ b/python/tvm/tir/tensor_intrin/hexagon.py @@ -48,20 +48,20 @@ def sync_dma_load_impl(a: T.handle, c: T.handle) -> None: T.evaluate( T.tvm_call_packed( "device_api.hexagon.dma_copy", - -1, #Use QueueId of -1 to not interfere with async copies. + -1, # Use QueueId of -1 to not interfere with async copies. T.address_of(C[0], dtype="handle"), T.address_of(A[0], dtype="handle"), size, - 0, #Do not use experimental bypass mode. + 0, # Do not use experimental bypass mode. dtype="int32", ) ) T.evaluate( T.tvm_call_packed( "device_api.hexagon.dma_wait", - -1, - 0, #Wait for the sync queue (-1) to have 0 messages. - dtype="int32" + -1, + 0, # Wait for the sync queue (-1) to have 0 messages. + dtype="int32", ) ) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 91876bb1e374..53d0428a5ad1 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -139,10 +139,10 @@ class TestMatMulVec: @tvm.testing.requires_hexagon def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split): """Test bandwidth.""" - + if tvm.testing.utils.IS_IN_CI and (size > 128): pytest.skip("Skipping test since it takes too long in CI.") - + # Run the base memcopy operator. sch = tvm.tir.Schedule(memcopy_operator(size)) base_gpbs = evaluate(hexagon_session, sch, size)
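Usage sketch (illustrative, not part of the patches above): the snippet below shows how the synchronous DMA intrinsics registered in python/tvm/tir/tensor_intrin/hexagon.py are applied, mirroring the tensorize step in test_vtcm_bandwidth.py. The vtcm_copy helper and the 1024-element size are hypothetical stand-ins for the test's memcopy_operator helper, which these diffs do not show; the only real requirement is a global -> global.vtcm copy block whose loop extent is a multiple of the 128 int8 elements that DMA_READ_128_i8 moves.

import tvm
from tvm.script import tir as T
from tvm.tir.tensor_intrin.hexagon import DMA_READ_128_i8


def vtcm_copy(size: int):
    # Hypothetical stand-in for the test's memcopy_operator: copy `size` int8
    # elements from a "global" buffer into a "global.vtcm" buffer.
    @T.prim_func
    def operator(a: T.handle, a_v: T.handle) -> None:
        A = T.match_buffer(a, size, dtype="int8")
        A_vtcm = T.match_buffer(a_v, size, dtype="int8", scope="global.vtcm")
        for i in T.serial(size):
            with T.block("A_global.vtcm"):
                vi = T.axis.remap("S", [i])
                A_vtcm[vi] = A[vi]

    return operator


size = 1024  # example size; any multiple of 128 works with DMA_READ_128_i8
sch = tvm.tir.Schedule(vtcm_copy(size))
block = sch.get_block("A_global.vtcm")
loops = sch.get_loops(block)
# Split so the inner loop moves exactly 128 elements, then replace that loop with
# the intrinsic's packed dma_copy/dma_wait calls on queue -1, which keeps this
# synchronous transfer separate from any async DMA queues.
_, inner = sch.split(loops[0], [None, 128])
sch.tensorize(inner, DMA_READ_128_i8)

After tensorization the module is built for the Hexagon target from .infrastructure.get_hexagon_target and timed through a Hexagon session, exactly as the bandwidth test does; that part is unchanged by these patches.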