Add examples of transform dialect for a special accelerator (#203)
* Add examples of matmul lowering for a special accelerator

* Minor modifications to `matmul_codegen_spec_pad.mlir`.

- Make "y" the outermost distribution dimension, and "x" the innermost
- Make tile sizes for local divide the tile sizes for shared
- Add a pass to hoist the static allocations to the "example script header".

---------

Co-authored-by: MaheshRavishankar <mahesh@nod-labs.com>
yzhang93 and MaheshRavishankar authored Sep 22, 2023
1 parent dacd37a commit bdd6265
Showing 3 changed files with 153 additions and 0 deletions.
65 changes: 65 additions & 0 deletions transform_dialect/examples/accel/matmul_codegen_spec.mlir
@@ -0,0 +1,65 @@
// This script shows a basic example of lowering a matmul through IREE for a special accelerator.
//
// ```
// export IREE_DIR=${HOME}/github/iree; \
// export IREE_SAMPLES_DIR=${HOME}/github/iree-samples; \
// ${IREE_DIR}/build/tools/iree-opt \
// ${IREE_SAMPLES_DIR}/transform_dialect/examples/accel/matmul_source.mlir \
// --iree-hal-target-backends=llvm-cpu \
// --iree-abi-transformation-pipeline \
// --iree-flow-transformation-pipeline \
// --iree-stream-transformation-pipeline \
// --iree-hal-configuration-pipeline | \
// ${IREE_DIR}/build/tools/iree-opt \
// --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))' \
// --iree-codegen-llvmcpu-use-transform-dialect=${IREE_SAMPLES_DIR}/transform_dialect/examples/accel/matmul_codegen_spec.mlir
// ```

module attributes { transform.with_named_sequence } {
transform.named_sequence @cleanup(%variant_op: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.iree.fold_fill_into_pad
transform.apply_patterns.scf.for_loop_canonicalization
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.iree.apply_licm %func : !transform.any_op
transform.iree.apply_cse %func : !transform.any_op
transform.yield
}

transform.sequence failures(propagate) {
^bb1(%variant_op: !transform.any_op):
%matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op

// First-level tile to forall with tile_sizes [15, 20].
%forall, %tiled_matmul =
transform.structured.tile_to_forall_op %matmul tile_sizes [15, 20]
( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
: (!transform.any_op) -> ()
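
// Roughly, this step rewrites the matmul into a workgroup-level loop of the
// following shape (an illustrative sketch, not literal IR; SSA names are
// made up). 300/15 = 20 tiles along M and 100/20 = 5 tiles along N:
//
//   scf.forall (%i, %j) in (20, 5) shared_outs(%out = %init) {
//     %lhs_t = tensor.extract_slice ...   // tensor<15x200xf32>
//     %rhs_t = tensor.extract_slice ...   // tensor<200x20xf32>
//     %acc_t = tensor.extract_slice ...   // tensor<15x20xf32>
//     %res_t = linalg.matmul ins(%lhs_t, %rhs_t ...) outs(%acc_t ...)
//     ...
//   } {mapping = [#gpu.block<x>, #gpu.block<y>]}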

// Tile reduction dimension.
%tiled_reduction, %loop =
transform.structured.tile %tiled_matmul [0, 0, 10]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
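
// The per-workgroup matmul now sits inside a serial loop over K
// (illustrative sketch, not literal IR):
//
//   scf.for %k = %c0 to %c200 step %c10 iter_args(%acc = %acc_t) {
//     %a = tensor.extract_slice ...   // tensor<15x10xf32>
//     %b = tensor.extract_slice ...   // tensor<10x20xf32>
//     %r = linalg.matmul ins(%a, %b ...) outs(%acc ...)
//     scf.yield %r : tensor<15x20xf32>
//   }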

// Second-level tile to forall with tile_sizes [5, 10].
%forall_1, %tiled_matmul_1 =
transform.structured.tile_to_forall_op %tiled_reduction tile_sizes [5, 10]
( mapping = [#gpu.thread<x>, #gpu.thread<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Clean up.
transform.include @cleanup failures(propagate) (%variant_op) : (!transform.any_op) -> ()
transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()

// Bufferize and drop the HAL descriptor from memref ops.
%variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op

// Post-bufferization: map the forall loops to workgroups and threads.
%memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
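// workgroup_dims = [3, 2, 1] follows from the tile sizes above: each 15x20
// workgroup tile is covered by 15/5 = 3 thread tiles along x and 20/10 = 2
// along y. (The subgroup_size of 8 is a property assumed for the target
// accelerator.)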
transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [3, 2, 1] subgroup_size = 8 : (!transform.any_op) -> ()
}
}
82 changes: 82 additions & 0 deletions transform_dialect/examples/accel/matmul_codegen_spec_pad.mlir
@@ -0,0 +1,82 @@
// This script shows an example of lowering a matmul through IREE for a special accelerator, using padding to obtain static tile shapes.
//
// ```
// export IREE_DIR=${HOME}/github/iree; \
// export IREE_SAMPLES_DIR=${HOME}/github/iree-samples; \
// ${IREE_DIR}/build/tools/iree-opt \
// ${IREE_SAMPLES_DIR}/transform_dialect/examples/accel/matmul_source.mlir \
// --iree-hal-target-backends=llvm-cpu \
// --iree-abi-transformation-pipeline \
// --iree-flow-transformation-pipeline \
// --iree-stream-transformation-pipeline \
// --iree-hal-configuration-pipeline | \
// ${IREE_DIR}/build/tools/iree-opt \
// --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target, builtin.module(func.func(iree-hoist-statically-bound-allocations)))))' \
// --iree-codegen-llvmcpu-use-transform-dialect=${IREE_SAMPLES_DIR}/transform_dialect/examples/accel/matmul_codegen_spec_pad.mlir
// ```

module attributes { transform.with_named_sequence } {
transform.named_sequence @cleanup(%variant_op: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.iree.fold_fill_into_pad
transform.apply_patterns.scf.for_loop_canonicalization
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.iree.apply_licm %func : !transform.any_op
transform.iree.apply_cse %func : !transform.any_op
transform.yield
}

transform.sequence failures(propagate) {
^bb1(%variant_op: !transform.any_op):
%matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op

// First-level tile to forall with tile_sizes [16, 32].
%forall, %tiled_matmul =
transform.structured.tile_to_forall_op %matmul tile_sizes [16, 32]
( mapping = [#gpu.block<y>, #gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
: (!transform.any_op) -> ()
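
// Note the mapping order: "y" is the outermost distribution dimension here
// (rows, tiled by 16) and "x" the innermost (columns, tiled by 32), the
// reverse of matmul_codegen_spec.mlir.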

// Tile reduction dimension.
%tiled_reduction, %loop =
transform.structured.tile %tiled_matmul [0, 0, 8]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Pad the reduction-tiled matmul.
%padded, %pad, %__ = transform.structured.pad %tiled_reduction {
padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
padding_dimensions=[0, 1, 2],
pack_paddings=[1, 1, 0],
copy_back_op="none"
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
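
// Since M = 300 and N = 100 are not multiples of the 16x32 workgroup tile,
// boundary tiles are partial; padding restores static 16x8 (LHS) and 8x32
// (RHS) shapes. Roughly, pack_paddings = [1, 1, 0] (one flag per operand:
// LHS, RHS, accumulator) marks the LHS and RHS pads as nofold so the padded
// tiles materialize as explicit buffers, and copy_back_op = "none" skips
// copying the padded result back.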

// Second-level tile to forall with tile_sizes [8, 8].
%forall_1, %tiled_matmul_1 =
transform.structured.tile_to_forall_op %padded tile_sizes [8, 8]
( mapping = [#gpu.thread<y>, #gpu.thread<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Pad the thread-tiled matmul.
%padded_1, %pad_1, %_ = transform.structured.pad %tiled_matmul_1 {
padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
padding_dimensions=[0, 1, 2],
pack_paddings=[0, 0, 1],
copy_back_op="none"
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
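
// The second pad targets the 8x8 thread tile; pack_paddings = [0, 0, 1] now
// keeps only the accumulator pad, so each thread gets its own padded output
// tile.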

// Clean up.
transform.include @cleanup failures(propagate) (%variant_op) : (!transform.any_op) -> ()
transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()

// Bufferize and drop the HAL descriptor from memref ops.
%variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op

// Post-bufferization: map the forall loops to workgroups and threads.
%memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
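// workgroup_dims = [4, 2, 1] follows from the tile sizes above: each 16x32
// workgroup tile is covered by 32/8 = 4 thread tiles along x and 16/8 = 2
// along y.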
transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 2, 1] subgroup_size = 1 : (!transform.any_op) -> ()
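// Hoist the fixed-size allocations introduced by padding out of the loops to
// the function entry (cf. the iree-hoist-statically-bound-allocations pass
// added to the example pipeline above).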
transform.iree.hoist_static_alloc %memref_func : (!transform.any_op) -> ()
}
}
6 changes: 6 additions & 0 deletions transform_dialect/examples/accel/matmul_source.mlir
@@ -0,0 +1,6 @@
func.func @matmul_example(%lhs: tensor<300x200xf32>, %rhs: tensor<200x100xf32>, %init : tensor<300x100xf32>) -> tensor<300x100xf32>
{
%res = linalg.matmul ins(%lhs, %rhs: tensor<300x200xf32>, tensor<200x100xf32>)
outs(%init: tensor<300x100xf32>) -> tensor<300x100xf32>
return %res : tensor<300x100xf32>
}
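
// Note: 300 and 100 are not multiples of the 16 and 32 tile sizes used in
// matmul_codegen_spec_pad.mlir, so boundary tiles there are partial and the
// padding path is exercised.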
