From baee5d9a57f6154f5c68882f2fc0b7d01c149c6b Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Thu, 5 Jan 2023 14:58:13 -0800 Subject: [PATCH 1/4] [Hexagon] Add hexagon user DMA intrins for tensorization. --- python/tvm/tir/tensor_intrin/hexagon.py | 123 ++++++++++++++---- .../test_hexagon/test_vtcm_bandwidth.py | 24 +++- 2 files changed, 119 insertions(+), 28 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py index 5e5749055bb0..0ab22e5d2daf 100644 --- a/python/tvm/tir/tensor_intrin/hexagon.py +++ b/python/tvm/tir/tensor_intrin/hexagon.py @@ -20,12 +20,53 @@ from .. import TensorIntrin -def generate_dot_product_32x4_u8u8i32(mem_scope="global"): +def generate_dma_load_intrin( + size: int, + dtype: str, +): + """Generator of dma_load intrins""" + + @T.prim_func + def dma_load_desc(a: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global") + C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm") + with T.block("root"): + T.reads(A[0:size]) + T.writes(C[0:size]) + for i in T.serial(size): + with T.block("load"): + vii = T.axis.remap("S", [i]) + C[vii] = A[vii] + + @T.prim_func + def dma_load_impl(a: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global") + C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm") + with T.block("root"): + T.reads(A[0:size]) + T.writes(C[0:size]) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.dma_copy", + 0, + T.address_of(C[0], dtype="handle"), + T.address_of(A[0], dtype="handle"), + size, + 0, + dtype="int32", + ) + ) + T.evaluate(T.tvm_call_packed("device_api.hexagon.dma_wait", 0, 0, dtype="int32")) + + return dma_load_desc, dma_load_impl + + +def generate_dot_product_32x4_u8u8i32(mem_scopes={"reads": "global", "write": "global"}): @T.prim_func def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -37,9 +78,9 @@ def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None @T.prim_func def dot_product_32x4_u8u8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -62,12 +103,12 @@ def dot_product_32x4_u8u8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non return dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy -def generate_dot_product_32x4_u8i8i32(mem_scope="global"): +def 
generate_dot_product_32x4_u8i8i32(mem_scopes={"reads": "global", "write": "global"}): @T.prim_func def dot_product_32x4_u8i8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -79,9 +120,9 @@ def dot_product_32x4_u8i8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None @T.prim_func def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -104,12 +145,12 @@ def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non return dot_product_32x4_u8i8i32_desc, dot_product_32x4_u8i8i32_vrmpy -def generate_dot_product_32x2_i16i16i32(mem_scope="global"): +def generate_dot_product_32x2_i16i16i32(mem_scopes={"reads": "global", "write": "global"}): @T.prim_func def dot_product_32x2_i16i16i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:2], B[0:32, 0:2]) T.writes(C[0:32]) @@ -121,9 +162,9 @@ def dot_product_32x2_i16i16i32_desc(a: T.handle, b: T.handle, c: T.handle) -> No @T.prim_func def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) + A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scopes["reads"]) + B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scopes["reads"]) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) with T.block("root"): T.reads(C[0:32], A[0:2], B[0:32, 0:2]) T.writes(C[0:32]) @@ -159,7 +200,43 @@ def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> N TensorIntrin.register(VDMPY_i16i16i32_INTRIN, *generate_dot_product_32x2_i16i16i32()) VRMPY_u8u8i32_VTCM_INTRIN = "dot_32x4_u8u8i32_vtcm_vrmpy" -TensorIntrin.register(VRMPY_u8u8i32_VTCM_INTRIN, 
*generate_dot_product_32x4_u8u8i32("global.vtcm"))
+TensorIntrin.register(
+    VRMPY_u8u8i32_VTCM_INTRIN,
+    *generate_dot_product_32x4_u8u8i32({"reads": "global.vtcm", "write": "global.vtcm"}),
+)
+
+VRMPY_u8u8i32_VTCM_READS_INTRIN = "dot_32x4_u8u8i32_vtcm_reads_vrmpy"
+TensorIntrin.register(
+    VRMPY_u8u8i32_VTCM_READS_INTRIN,
+    *generate_dot_product_32x4_u8u8i32({"reads": "global.vtcm", "write": "global"}),
+)
 
 VRMPY_u8i8i32_VTCM_INTRIN = "dot_32x4_u8i8i32_vtcm_vrmpy"
-TensorIntrin.register(VRMPY_u8i8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8i8i32("global.vtcm"))
+TensorIntrin.register(
+    VRMPY_u8i8i32_VTCM_INTRIN,
+    *generate_dot_product_32x4_u8i8i32({"reads": "global.vtcm", "write": "global.vtcm"}),
+)
+
+DMA_READ_1_u8 = "dma_read_1_u8"
+TensorIntrin.register(DMA_READ_1_u8, *generate_dma_load_intrin(1, "uint8"))
+
+DMA_READ_1_i8 = "dma_read_1_i8"
+TensorIntrin.register(DMA_READ_1_i8, *generate_dma_load_intrin(1, "int8"))
+
+DMA_READ_128_u8 = "dma_read_128_u8"
+TensorIntrin.register(DMA_READ_128_u8, *generate_dma_load_intrin(128, "uint8"))
+
+DMA_READ_128_i8 = "dma_read_128_i8"
+TensorIntrin.register(DMA_READ_128_i8, *generate_dma_load_intrin(128, "int8"))
+
+DMA_READ_1024_u8 = "dma_read_1024_u8"
+TensorIntrin.register(DMA_READ_1024_u8, *generate_dma_load_intrin(1024, "uint8"))
+
+DMA_READ_1024_i8 = "dma_read_1024_i8"
+TensorIntrin.register(DMA_READ_1024_i8, *generate_dma_load_intrin(1024, "int8"))
+
+DMA_READ_4096_u8 = "dma_read_4096_u8"
+TensorIntrin.register(DMA_READ_4096_u8, *generate_dma_load_intrin(4096, "uint8"))
+
+DMA_READ_4096_i8 = "dma_read_4096_i8"
+TensorIntrin.register(DMA_READ_4096_i8, *generate_dma_load_intrin(4096, "int8"))
diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
index 254eb00cb2ea..97a4bdeca0f2 100644
--- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
+++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py
@@ -21,6 +21,7 @@
 
 import tvm
 from tvm.script import tir as T
+from tvm.tir.tensor_intrin.hexagon import DMA_READ_128_i8
 
 from .infrastructure import get_hexagon_target
 
@@ -30,6 +31,7 @@
     "Test bandwidth with buffer size {}MB... \n"
     "    -Base: {} GBps \n    -Vectorized: {} GBps\n"
     "    -Vectorized and Parallelized: {} GBps\n"
+    "    -Sync DMA: {} GBps\n"
     "    -Single DMA Copy: {} GBps\n"
 )
 
@@ -104,8 +106,8 @@ def evaluate(hexagon_session, sch, size):
     )
 
     # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise.
-    number = 1
-    repeat = 1
+    number = 10
+    repeat = 10
 
     timer = module.time_evaluator(
         "__tvm_main__", hexagon_session.device, number=number, repeat=repeat
@@ -123,7 +125,10 @@ class TestMatMulVec:
 
     # Removed most of these to speedup CI.
     size = tvm.testing.parameter(
-        # 10 * KB,
+        128,
+        256,
+        1024,
+        10 * KB,
         # 20 * KB,
         # 40 * KB,
         # 80 * KB,
         # 160 * KB,
         # 320 * KB,
         640 * KB,
         # MB,
-        # 2 * MB,
+        2 * MB,
         # 3 * MB,
         # 4 * MB,
         # 8 * MB,  # Only works on 8gen1 HDKs
@@ -169,6 +174,15 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto
         sch.parallel(vbo_a)
         parallel_gbps = evaluate(hexagon_session, sch, size)
 
+        # Run while tensorizing the load into VTCM with the synchronous DMA intrinsic.
+ sch = tvm.tir.Schedule(memcopy_operator(size)) + block = sch.get_block("A_global.vtcm") + loops = sch.get_loops(block) + _, inner = sch.split(loops[0], [None, 128]) + sch.tensorize(inner, DMA_READ_128_i8) + # print(sch.mod.script()) + sync_dma_gbps = evaluate(hexagon_session, sch, size) + # Run using a single dma copy to transfer the data. sch = tvm.tir.Schedule(single_dma_operator(size)) single_dma_gbps = evaluate(hexagon_session, sch, size) @@ -176,7 +190,7 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto mbs = round(size / MB, 2) print( TEST_OUTPUT_TEMPLATE.format( - mbs, base_gpbs, vectorize_gbps, parallel_gbps, single_dma_gbps + mbs, base_gpbs, vectorize_gbps, parallel_gbps, sync_dma_gbps, single_dma_gbps ) ) From fb97e624a2d4c5042832f456e4ee4b33a09876d8 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Fri, 6 Jan 2023 17:17:38 -0800 Subject: [PATCH 2/4] revert changes not needed --- python/tvm/tir/tensor_intrin/hexagon.py | 58 ++++++++++--------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py index 0ab22e5d2daf..d4c636018272 100644 --- a/python/tvm/tir/tensor_intrin/hexagon.py +++ b/python/tvm/tir/tensor_intrin/hexagon.py @@ -61,12 +61,12 @@ def dma_load_impl(a: T.handle, c: T.handle) -> None: return dma_load_desc, dma_load_impl -def generate_dot_product_32x4_u8u8i32(mem_scopes={"reads": "global", "write": "global"}): +def generate_dot_product_32x4_u8u8i32(mem_scope="global"): @T.prim_func def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -78,9 +78,9 @@ def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None @T.prim_func def dot_product_32x4_u8u8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 4), "uint8", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -103,12 +103,12 @@ def dot_product_32x4_u8u8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non return dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy -def generate_dot_product_32x4_u8i8i32(mem_scopes={"reads": "global", "write": "global"}): +def generate_dot_product_32x4_u8i8i32(mem_scope="global"): @T.prim_func def dot_product_32x4_u8i8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, 
(32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -120,9 +120,9 @@ def dot_product_32x4_u8i8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None @T.prim_func def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:4], B[0:32, 0:4]) T.writes(C[0:32]) @@ -145,12 +145,12 @@ def dot_product_32x4_u8i8i32_vrmpy(a: T.handle, b: T.handle, c: T.handle) -> Non return dot_product_32x4_u8i8i32_desc, dot_product_32x4_u8i8i32_vrmpy -def generate_dot_product_32x2_i16i16i32(mem_scopes={"reads": "global", "write": "global"}): +def generate_dot_product_32x2_i16i16i32(mem_scope="global"): @T.prim_func def dot_product_32x2_i16i16i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:2], B[0:32, 0:2]) T.writes(C[0:32]) @@ -162,9 +162,9 @@ def dot_product_32x2_i16i16i32_desc(a: T.handle, b: T.handle, c: T.handle) -> No @T.prim_func def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scopes["reads"]) - B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scopes["reads"]) - C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scopes["write"]) + A = T.match_buffer(a, (2,), "int16", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (32, 2), "int16", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:32], A[0:2], B[0:32, 0:2]) T.writes(C[0:32]) @@ -200,22 +200,10 @@ def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> N TensorIntrin.register(VDMPY_i16i16i32_INTRIN, *generate_dot_product_32x2_i16i16i32()) VRMPY_u8u8i32_VTCM_INTRIN = "dot_32x4_u8u8i32_vtcm_vrmpy" -TensorIntrin.register( - VRMPY_u8u8i32_VTCM_INTRIN, - *generate_dot_product_32x4_u8u8i32({"reads": "global.vtcm", "write": "global.vtcm"}), -) - -VRMPY_u8u8i32_VTCM_READS_INTRIN = "dot_32x4_u8u8i32_vtcm_reads_vrmpy" -TensorIntrin.register( - VRMPY_u8u8i32_VTCM_READS_INTRIN, - *generate_dot_product_32x4_u8u8i32({"reads": "global.vtcm", "write": "global"}), -) +TensorIntrin.register(VRMPY_u8u8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8u8i32("global.vtcm")) 
VRMPY_u8i8i32_VTCM_INTRIN = "dot_32x4_u8i8i32_vtcm_vrmpy" -TensorIntrin.register( - VRMPY_u8i8i32_VTCM_INTRIN, - *generate_dot_product_32x4_u8i8i32({"reads": "global.vtcm", "write": "global.vtcm"}), -) +TensorIntrin.register(VRMPY_u8i8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8i8i32("global.vtcm")) DMA_READ_1_u8 = "dma_read_1_u8" TensorIntrin.register(DMA_READ_1_u8, *generate_dma_load_intrin(1, "uint8")) From a04de7c96c5ac19c1578a710a730f56bf1cd4783 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Fri, 13 Jan 2023 16:12:57 -0800 Subject: [PATCH 3/4] Add comments, change queue id to -1, and remove tests from CI --- python/tvm/tir/tensor_intrin/hexagon.py | 37 +++++++------------ .../test_hexagon/test_vtcm_bandwidth.py | 32 +++++++--------- 2 files changed, 26 insertions(+), 43 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py index d4c636018272..d5de862fc91d 100644 --- a/python/tvm/tir/tensor_intrin/hexagon.py +++ b/python/tvm/tir/tensor_intrin/hexagon.py @@ -27,7 +27,7 @@ def generate_dma_load_intrin( """Generator of dma_load intrins""" @T.prim_func - def dma_load_desc(a: T.handle, c: T.handle) -> None: + def sync_dma_load_desc(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global") C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm") with T.block("root"): @@ -39,7 +39,7 @@ def dma_load_desc(a: T.handle, c: T.handle) -> None: C[vii] = A[vii] @T.prim_func - def dma_load_impl(a: T.handle, c: T.handle) -> None: + def sync_dma_load_impl(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global") C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm") with T.block("root"): @@ -48,17 +48,24 @@ def dma_load_impl(a: T.handle, c: T.handle) -> None: T.evaluate( T.tvm_call_packed( "device_api.hexagon.dma_copy", - 0, + -1, #Use QueueId of -1 to not interfere with async copies. T.address_of(C[0], dtype="handle"), T.address_of(A[0], dtype="handle"), size, - 0, + 0, #Do not use experimental bypass mode. dtype="int32", ) ) - T.evaluate(T.tvm_call_packed("device_api.hexagon.dma_wait", 0, 0, dtype="int32")) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.dma_wait", + -1, + 0, #Wait for the sync queue (-1) to have 0 messages. 
+ dtype="int32" + ) + ) - return dma_load_desc, dma_load_impl + return sync_dma_load_desc, sync_dma_load_impl def generate_dot_product_32x4_u8u8i32(mem_scope="global"): @@ -205,26 +212,8 @@ def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> N VRMPY_u8i8i32_VTCM_INTRIN = "dot_32x4_u8i8i32_vtcm_vrmpy" TensorIntrin.register(VRMPY_u8i8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8i8i32("global.vtcm")) -DMA_READ_1_u8 = "dma_read_1_u8" -TensorIntrin.register(DMA_READ_1_u8, *generate_dma_load_intrin(1, "uint8")) - -DMA_READ_1_i8 = "dma_read_1_i8" -TensorIntrin.register(DMA_READ_1_i8, *generate_dma_load_intrin(1, "int8")) - DMA_READ_128_u8 = "dma_read_128_u8" TensorIntrin.register(DMA_READ_128_u8, *generate_dma_load_intrin(128, "uint8")) DMA_READ_128_i8 = "dma_read_128_i8" TensorIntrin.register(DMA_READ_128_i8, *generate_dma_load_intrin(128, "int8")) - -DMA_READ_1024_u8 = "dma_read_1024_u8" -TensorIntrin.register(DMA_READ_1024_u8, *generate_dma_load_intrin(1024, "uint8")) - -DMA_READ_1024_i8 = "dma_read_1024_i8" -TensorIntrin.register(DMA_READ_1024_i8, *generate_dma_load_intrin(1024, "int8")) - -DMA_READ_4096_u8 = "dma_read_4096_u8" -TensorIntrin.register(DMA_READ_4096_u8, *generate_dma_load_intrin(4096, "uint8")) - -DMA_READ_4096_i8 = "dma_read_4096_i8" -TensorIntrin.register(DMA_READ_4096_i8, *generate_dma_load_intrin(4096, "int8")) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 97a4bdeca0f2..91876bb1e374 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -20,6 +20,7 @@ import numpy as np import tvm +import pytest from tvm.script import tir as T from tvm.tir.tensor_intrin.hexagon import DMA_READ_128_i8 @@ -105,13 +106,12 @@ def evaluate(hexagon_session, sch, size): a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" ) - # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. - number = 10 - repeat = 10 + if tvm.testing.utils.IS_IN_CI: + # Run with reduced number and repeat for CI + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1) + else: + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=10, repeat=10) - timer = module.time_evaluator( - "__tvm_main__", hexagon_session.device, number=number, repeat=repeat - ) runtime = timer(a_hexagon, a_vtcm_hexagon) gbps = round((size / 2**30) / runtime.mean, 4) @@ -126,20 +126,10 @@ class TestMatMulVec: # Removed most of these to speedup CI. size = tvm.testing.parameter( 128, - 256, - 1024, + KB, 10 * KB, - # 20 * KB, - # 40 * KB, - # 80 * KB, - # 160 * KB, - # 320 * KB, - 640 * KB, - # MB, - 2 * MB, - # 3 * MB, - # 4 * MB, - # 8 * MB, # Only works on 8gen1 HDKs + 100 * KB, + MB, ) outer_split = tvm.testing.parameter(4) @@ -149,6 +139,10 @@ class TestMatMulVec: @tvm.testing.requires_hexagon def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split): """Test bandwidth.""" + + if tvm.testing.utils.IS_IN_CI and (size > 128): + pytest.skip("Skipping test since it takes too long in CI.") + # Run the base memcopy operator. 
sch = tvm.tir.Schedule(memcopy_operator(size)) base_gpbs = evaluate(hexagon_session, sch, size) From 51641c68d15c274d650faf52195349dd5b7ee1dc Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Fri, 13 Jan 2023 16:13:22 -0800 Subject: [PATCH 4/4] lint changes --- python/tvm/tir/tensor_intrin/hexagon.py | 10 +++++----- .../python/contrib/test_hexagon/test_vtcm_bandwidth.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py index d5de862fc91d..7a348f3f1a45 100644 --- a/python/tvm/tir/tensor_intrin/hexagon.py +++ b/python/tvm/tir/tensor_intrin/hexagon.py @@ -48,20 +48,20 @@ def sync_dma_load_impl(a: T.handle, c: T.handle) -> None: T.evaluate( T.tvm_call_packed( "device_api.hexagon.dma_copy", - -1, #Use QueueId of -1 to not interfere with async copies. + -1, # Use QueueId of -1 to not interfere with async copies. T.address_of(C[0], dtype="handle"), T.address_of(A[0], dtype="handle"), size, - 0, #Do not use experimental bypass mode. + 0, # Do not use experimental bypass mode. dtype="int32", ) ) T.evaluate( T.tvm_call_packed( "device_api.hexagon.dma_wait", - -1, - 0, #Wait for the sync queue (-1) to have 0 messages. - dtype="int32" + -1, + 0, # Wait for the sync queue (-1) to have 0 messages. + dtype="int32", ) ) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 91876bb1e374..53d0428a5ad1 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -139,10 +139,10 @@ class TestMatMulVec: @tvm.testing.requires_hexagon def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split): """Test bandwidth.""" - + if tvm.testing.utils.IS_IN_CI and (size > 128): pytest.skip("Skipping test since it takes too long in CI.") - + # Run the base memcopy operator. sch = tvm.tir.Schedule(memcopy_operator(size)) base_gpbs = evaluate(hexagon_session, sch, size)
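Usage sketch (illustrative, not part of the patches above): the snippet below shows how the synchronous DMA intrinsics registered in python/tvm/tir/tensor_intrin/hexagon.py are applied, mirroring the tensorize step in test_vtcm_bandwidth.py. The vtcm_copy helper and the 1024-element size are hypothetical stand-ins for the test's memcopy_operator helper, which these diffs do not show; the only real requirement is a global -> global.vtcm copy block whose loop extent is a multiple of the 128 int8 elements that DMA_READ_128_i8 moves.

import tvm
from tvm.script import tir as T
from tvm.tir.tensor_intrin.hexagon import DMA_READ_128_i8


def vtcm_copy(size: int):
    # Hypothetical stand-in for the test's memcopy_operator: copy `size` int8
    # elements from a "global" buffer into a "global.vtcm" buffer.
    @T.prim_func
    def operator(a: T.handle, a_v: T.handle) -> None:
        A = T.match_buffer(a, size, dtype="int8")
        A_vtcm = T.match_buffer(a_v, size, dtype="int8", scope="global.vtcm")
        for i in T.serial(size):
            with T.block("A_global.vtcm"):
                vi = T.axis.remap("S", [i])
                A_vtcm[vi] = A[vi]

    return operator


size = 1024  # example size; any multiple of 128 works with DMA_READ_128_i8
sch = tvm.tir.Schedule(vtcm_copy(size))
block = sch.get_block("A_global.vtcm")
loops = sch.get_loops(block)
# Split so the inner loop moves exactly 128 elements, then replace that loop with
# the intrinsic's packed dma_copy/dma_wait calls on queue -1, which keeps this
# synchronous transfer separate from any async DMA queues.
_, inner = sch.split(loops[0], [None, 128])
sch.tensorize(inner, DMA_READ_128_i8)

After tensorization the module is built for the Hexagon target from .infrastructure.get_hexagon_target and timed through a Hexagon session, exactly as the bandwidth test does; that part is unchanged by these patches.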