[TIR][DLight] Enable SimdGroup op for Metal #17112

Merged 1 commit on Jun 24, 2024
44 changes: 43 additions & 1 deletion include/tvm/tir/builtin.h
@@ -746,7 +746,7 @@ TVM_DLL const Op& create_barriers();
TVM_DLL const Op& mma_store();

/*!
* \brief tvm intrinsic for zero-initalizing an MMA accumulation registor.
* \brief tvm intrinsic for zero-initializing an MMA accumulation register.
* For example, if each thread in a warp of size 32 has 8 elements from the A matrix in
* m16xn8xk16 MMA in its registers, this intrinsic can be used to zero-initialize its
* 4 accumulation registers.
@@ -758,6 +758,48 @@ TVM_DLL const Op& mma_store();
*/
TVM_DLL const Op& mma_fill();

// Metal SimdGroup matrix intrinsics

/*!
* \brief tvm intrinsic for initializing a simdgroup matrix with a given value.
* \note only the 8x8 shape is supported by the Metal Spec and TVM, but we still keep the shape
* as params to stay close to the Metal Spec interface.
*
* void make_filled_simdgroup_matrix(Var d, PrimExpr index, PrimExpr value,
* int col = 8, int row = 8);
*/
TVM_DLL const Op& make_filled_simdgroup_matrix();

/*!
* \brief tvm intrinsic for loading data from device memory or threadgroup memory into a simdgroup matrix.
* \note only the 8x8 shape is supported by the Metal Spec and TVM, but we still keep the shape
* as params to stay close to the Metal Spec interface.
*
* void simdgroup_load(Var d, PrimExpr index, PrimExpr ptr, PrimExpr stride,
*                       int col = 8, int row = 8, bool transpose_matrix = false);
*/
TVM_DLL const Op& simdgroup_load();

/*!
* \brief tvm intrinsic for storing data from a simdgroup matrix to device memory or threadgroup memory.
* \note only the 8x8 shape is supported by the Metal Spec and TVM, but we still keep the shape
* as params to stay close to the Metal Spec interface.
*
* void simdgroup_store(Var d, PrimExpr index, PrimExpr ptr, PrimExpr stride,
* int col = 8, int row = 8, bool transpose_matrix = false);
*/
TVM_DLL const Op& simdgroup_store();

/*!
* \brief tvm intrinsic for the multiply-accumulate of two matrices in a simdgroup.
* \note only the 8x8 shape is supported by the Metal Spec and TVM, but we still keep the shape
* as params to stay close to the Metal Spec interface.
*
* void simdgroup_mma(Var d, PrimExpr index_d, Var a, PrimExpr index_a,
* Var b, PrimExpr index_b, Var c, PrimExpr index_c);
*/
TVM_DLL const Op& simdgroup_multiply_accumulate();

// TODO(tvm-team) replace the usage of the vector operations by Shuffle.
/*!
* \brief Get the high level half of the vector
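Taken together, these four intrinsics mirror Metal's simdgroup matrix API: fill an accumulator fragment, load A and B tiles, multiply-accumulate, and store the result. Below is a minimal sketch (not part of this diff) of how they might compose in TVMScript for a single 8x8 float16 tile; the buffer names, the metal.simdgroup staging buffers, and the exact Python-side argument spelling are assumptions based on the signatures documented above.

```python
# Illustrative sketch only: one 8x8 tile of C = A * B using the new simdgroup
# intrinsics. Argument order follows the builtin.h doc comments; the TVMScript
# wrappers added later in this diff are assumed to mirror it.
from tvm.script import tir as T


@T.prim_func
def simdgroup_gemm_8x8(a: T.handle, b: T.handle, c: T.handle):
    A = T.match_buffer(a, (8, 8), "float16", scope="shared")
    B = T.match_buffer(b, (8, 8), "float16", scope="shared")
    C = T.match_buffer(c, (8, 8), "float16", scope="shared")
    with T.block("root"):
        A_frag = T.alloc_buffer((8, 8), "float16", scope="metal.simdgroup")
        B_frag = T.alloc_buffer((8, 8), "float16", scope="metal.simdgroup")
        C_frag = T.alloc_buffer((8, 8), "float16", scope="metal.simdgroup")
        # C_frag <- 0 (fragment 0 of the accumulator buffer)
        T.evaluate(T.make_filled_simdgroup_matrix(C_frag.data, 0, T.float16(0), 8, 8))
        # Load the A and B tiles from threadgroup (shared) memory; the stride is
        # the row pitch in elements, here 8.
        T.evaluate(T.simdgroup_load(A_frag.data, 0, A.access_ptr("r"), 8, 8, 8, False))
        T.evaluate(T.simdgroup_load(B_frag.data, 0, B.access_ptr("r"), 8, 8, 8, False))
        # C_frag += A_frag * B_frag
        T.evaluate(
            T.simdgroup_multiply_accumulate(
                C_frag.data, 0, A_frag.data, 0, B_frag.data, 0, C_frag.data, 0
            )
        )
        # Write the accumulated tile back out.
        T.evaluate(T.simdgroup_store(C_frag.data, 0, C.access_ptr("w"), 8, 8, 8, False))
```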
145 changes: 145 additions & 0 deletions python/tvm/dlight/gpu/matmul.py
@@ -313,6 +313,146 @@ def check_sm_version(arch: str) -> int:
return int(sm_version) if sm_version.isdigit() else -1


class MetalMatmul(GPUScheduleRule):
"""
The schedule rule for Metal matmul computation.
"""

def apply( # pylint: disable=too-many-locals,missing-docstring
self,
func: tir.PrimFunc,
target: Target,
_: bool,
) -> Optional[tir.Schedule]:
from tvm.tir.tensor_intrin.metal import ( # pylint: disable=import-outside-toplevel
get_simdgroup_intrin_group,
)

if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target):
return None
sch = tir.Schedule(func)
root_block = analysis.get_root_block(sch)
blocks = sch.get_child_blocks(root_block)

reduction_blocks = get_reduction_blocks(sch, blocks)
if reduction_blocks is None:
return None

main_block = reduction_blocks[0]
block_stmt = sch.get(main_block)
index_maps = get_index_map(block_stmt)
if index_maps is None:
return None
matmul_index_map, a_index_map, b_index_map, c_index_map = index_maps

# Step 0. Configs
block_size_x: int = 16
block_size_y: int = 16
block_size_k: int = 32
micro_size: int = 8
warp_size: int = 32
ty_len: int = 1
tz_len: int = 4
vector_size: int = 4
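# With these configs, a threadgroup holds ty_len * tz_len = 4 simdgroups of warp_size = 32
# threads; each simdgroup computes a block_size_x x block_size_y (16x16) patch of C as
# 2x2 micro_size (8x8) fragments, so one threadgroup covers a 16x64 output tile and steps
# over the reduction axis in block_size_k = 32 chunks.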

# Step 1. Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K]
block = sch.reindex(main_block, ("read", 0))
sch.transform_layout(block, ("write", 0), a_index_map)
block = sch.reindex(main_block, ("read", 1))
sch.transform_layout(block, ("write", 0), b_index_map)
block = sch.reindex(main_block, ("write", 0))
sch.transform_layout(block, ("read", 0), c_index_map)
sch.transform_block_layout(main_block, matmul_index_map)

# Step 2. Padding for dynamic shape kernels
sch.pad_einsum(
main_block,
[
1,
ty_len * block_size_x,
tz_len * block_size_y,
block_size_k,
],
)

# Step 3. Schedule matmul to use simdgroup intrinsics
batch, i, j, k = sch.get_loops(main_block)
bx, ty, i0, i1 = sch.split(i, [None, ty_len, block_size_x // micro_size, micro_size])
by, tz, j0, j1 = sch.split(j, [None, tz_len, block_size_y // micro_size, micro_size])
k0, k1, k2 = sch.split(k, [None, block_size_k // micro_size, micro_size])
sch.reorder(bx, by, ty, tz, k0, k1, i0, j0, i1, j1, k2)
sch.bind(bx, "blockIdx.x")
sch.bind(by, "blockIdx.y")
sch.bind(batch, "blockIdx.z")
sch.bind(ty, "threadIdx.y")
sch.bind(tz, "threadIdx.z")

def fetch_to_shared(block, idx):
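# Cooperatively stage one K-tile of the operand in shared memory: the copy loop is
# fused, then split across all tz_len * ty_len * warp_size threads, with each thread
# moving vector_size contiguous elements per vectorized step.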
block_read = sch.cache_read(block, idx, "shared")
sch.compute_at(block_read, k0, preserve_unit_loops=True)
fused = sch.fuse(*sch.get_loops(block_read)[-2:])
_, _tz, _ty, _tx, vec = sch.split(fused, [None, tz_len, ty_len, warp_size, vector_size])

sch.bind(_tz, "threadIdx.z")
sch.bind(_ty, "threadIdx.y")
sch.bind(_tx, "threadIdx.x")
sch.vectorize(vec)

return block_read

a_g2s = fetch_to_shared(main_block, 0)
b_g2s = fetch_to_shared(main_block, 1)

auto_inline_producers(sch, a_g2s)
auto_inline_producers(sch, b_g2s)

# create read caches to load matrices from shared memory into simdgroup fragments
A_simdgroup = sch.cache_read(main_block, 0, "metal.simdgroup")
B_simdgroup = sch.cache_read(main_block, 1, "metal.simdgroup")
sch.compute_at(A_simdgroup, k1)
sch.compute_at(B_simdgroup, k1)

C_simd2s = sch.cache_write(main_block, 0, "metal.simdgroup")
C_s2g = sch.cache_write(C_simd2s, 0, "shared")
sch.reverse_compute_at(C_simd2s, tz, preserve_unit_loops=True)
sch.reverse_compute_at(C_s2g, by, preserve_unit_loops=True)
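# Epilogue: the main block accumulates into simdgroup fragments; C_simd2s stores the
# fragments to shared memory (tensorized with the "store" intrin below), and C_s2g
# performs the final shared-to-global copy as a fused, vectorized, thread-bound loop.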

intrin_group = get_simdgroup_intrin_group(
load_scope="shared",
store_scope="shared",
dtype="float16",
trans_a=False,
trans_b=True,
)
sch.transform_layout(B_simdgroup, ("write", 0), lambda s, i, j: (s, j, i))
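# Step 1 normalized B to [S, J, K] (reduction axis innermost), so B fragments are loaded
# transposed (trans_b=True); swapping the last two axes of the simdgroup cache keeps its
# layout consistent with the transposed load.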

def tensorize_block(block: tir.schedule.BlockRV, intrin: str):
*_, i, j = sch.get_loops(block)
io, ii = sch.split(i, [None, micro_size])
jo, ji = sch.split(j, [None, micro_size])
sch.reorder(io, jo, ii, ji)
sch.tensorize(ii, intrin)

C_init = sch.decompose_reduction(main_block, k0)
tensorize_block(A_simdgroup, intrin_group["load_a"])
tensorize_block(B_simdgroup, intrin_group["load_b"])
tensorize_block(C_simd2s, intrin_group["store"])
tensorize_block(C_init, intrin_group["init"])

*_, i, j, k = sch.get_loops(main_block)
sch.tensorize(i, intrin_group["compute"])

auto_inline_consumer_chain(sch, C_s2g)
fused = sch.fuse(*sch.get_loops(C_s2g)[-2:])
_, _tz, _ty, _tx, vec = sch.split(fused, [None, tz_len, ty_len, warp_size, vector_size])
sch.bind(_tz, "threadIdx.z")
sch.bind(_ty, "threadIdx.y")
sch.bind(_tx, "threadIdx.x")
sch.vectorize(vec)

return sch


class MatmulTensorization(GPUScheduleRule):
"""
The schedule rule for float16 tensor core matmul computation.
@@ -848,6 +988,11 @@ def apply( # pylint: disable=too-many-locals,missing-docstring
tensorize_sch = MatmulTensorization().apply(func, target, _)
if tensorize_sch is not None:
return tensorize_sch
elif target.kind.name == "metal":
try:
return MetalMatmul().apply(func, target, _)
except: # pylint: disable=bare-except
pass

# Step 2. Get schedule config.
config = self.get_configs(target)
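For context, here is a hedged end-to-end sketch (not part of this diff) of exercising the new rule: either calling MetalMatmul directly on a matmul PrimFunc, or going through the generic dlight Matmul rule, which the hunk above teaches to try MetalMatmul on metal targets before falling back to the generic schedule. The workload and shapes are illustrative.

```python
import tvm
from tvm import dlight as dl
from tvm.dlight.gpu.matmul import MetalMatmul
from tvm.script import tir as T


@T.prim_func
def matmul(A: T.Buffer((256, 256), "float16"),
           B: T.Buffer((256, 256), "float16"),
           C: T.Buffer((256, 256), "float16")):
    for i, j, k in T.grid(256, 256, 256):
        with T.block("C"):
            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
            with T.init():
                C[vi, vj] = T.float16(0)
            C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]


target = tvm.target.Target("metal")

# Apply the rule directly to a single PrimFunc.
sch = MetalMatmul().apply(matmul, target, False)
if sch is not None:
    sch.mod.show()  # scheduled TIR with metal.simdgroup caches and tensorized loops

# Or let the generic dlight pipeline dispatch; on metal targets the Matmul rule
# now tries MetalMatmul before falling back to the generic schedule.
mod = tvm.IRModule({"main": matmul})
with target:
    mod = dl.ApplyDefaultSchedule(dl.gpu.Matmul())(mod)
```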
8 changes: 8 additions & 0 deletions python/tvm/script/ir_builder/tir/ir.py
@@ -1887,6 +1887,10 @@ def wrapped(*args, **kwargs):
ptx_arrive_barrier = _op_wrapper(_tir_op.ptx_arrive_barrier)
ptx_arrive_barrier_expect_tx = _op_wrapper(_tir_op.ptx_arrive_barrier_expect_tx)
ptx_wait_barrier = _op_wrapper(_tir_op.ptx_wait_barrier)
make_filled_simdgroup_matrix = _op_wrapper(_tir_op.make_filled_simdgroup_matrix)
simdgroup_load = _op_wrapper(_tir_op.simdgroup_load)
simdgroup_store = _op_wrapper(_tir_op.simdgroup_store)
simdgroup_multiply_accumulate = _op_wrapper(_tir_op.simdgroup_multiply_accumulate)
create_barriers = _op_wrapper(_tir_op.create_barriers)
assume = _op_wrapper(_tir_op.assume)
undef = _op_wrapper(_tir_op.undef)
@@ -2177,6 +2181,10 @@ def wrapped(*args, **kwargs):
"ptx_arrive_barrier",
"ptx_arrive_barrier_expect_tx",
"ptx_wait_barrier",
"make_filled_simdgroup_matrix",
"simdgroup_load",
"simdgroup_store",
"simdgroup_multiply_accumulate",
"create_barriers",
"mma_store",
"mma_fill",
6 changes: 6 additions & 0 deletions python/tvm/tir/__init__.py
@@ -73,6 +73,12 @@
ptx_wait_barrier,
create_barriers,
)
from .op import (
make_filled_simdgroup_matrix,
simdgroup_load,
simdgroup_multiply_accumulate,
simdgroup_store,
)
from .op import vectorlow, vectorhigh, vectorcombine
from .op import infinity, reinterpret
from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp, clz