Add examples of transform dialect for a special accelerator (#203)
* Add examples of matmul lowering for a special accelerator

* Minor modifications to `matmul_codegen_spec_pad.mlir`.

- Make "y" the outermost distribution dimension, and "x" the innermost
- Make tile sizes for local divide the tile sizes for shared
- Add a pass to hoist the static allocations to the "example script header".

---------

Co-authored-by: MaheshRavishankar <mahesh@nod-labs.com>
yzhang93 and MaheshRavishankar authored Sep 22, 2023
1 parent dacd37a commit bdd6265
Showing 3 changed files with 153 additions and 0 deletions.
65 changes: 65 additions & 0 deletions transform_dialect/examples/accel/matmul_codegen_spec.mlir
@@ -0,0 +1,65 @@
// This script shows a basic example of lowering a matmul through IREE for a special accelerator.
//
// ```
// export IREE_DIR=${HOME}/github/iree; \
// export IREE_SAMPLES_DIR=${HOME}/github/iree-samples; \
// ${IREE_DIR}/build/tools/iree-opt \
// ${IREE_SAMPLES_DIR}/transform_dialect/examples/accel/matmul_source.mlir \
// --iree-hal-target-backends=llvm-cpu \
// --iree-abi-transformation-pipeline \
// --iree-flow-transformation-pipeline \
// --iree-stream-transformation-pipeline \
// --iree-hal-configuration-pipeline | \
// ${IREE_DIR}/build/tools/iree-opt \
// --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))' \
// --iree-codegen-llvmcpu-use-transform-dialect=${IREE_SAMPLES_DIR}/transform_dialect/examples/accel/matmul_codegen_spec.mlir
// ```

module attributes { transform.with_named_sequence } {
transform.named_sequence @cleanup(%variant_op: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.iree.fold_fill_into_pad
transform.apply_patterns.scf.for_loop_canonicalization
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.iree.apply_licm %func : !transform.any_op
transform.iree.apply_cse %func : !transform.any_op
transform.yield
}

transform.sequence failures(propagate) {
^bb1(%variant_op: !transform.any_op):
%matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op

// First-level tile to forall with tile_sizes [15, 20].
%forall, %tiled_matmul =
transform.structured.tile_to_forall_op %matmul tile_sizes [15, 20]
( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
: (!transform.any_op) -> ()
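
// Roughly, this step rewrites the matmul into a workgroup-level loop of the
// following shape (an illustrative sketch, not literal IR; SSA names are
// made up). 300/15 = 20 tiles along M and 100/20 = 5 tiles along N:
//
//   scf.forall (%i, %j) in (20, 5) shared_outs(%out = %init) {
//     %lhs_t = tensor.extract_slice ...   // tensor<15x200xf32>
//     %rhs_t = tensor.extract_slice ...   // tensor<200x20xf32>
//     %acc_t = tensor.extract_slice ...   // tensor<15x20xf32>
//     %res_t = linalg.matmul ins(%lhs_t, %rhs_t ...) outs(%acc_t ...)
//     ...
//   } {mapping = [#gpu.block<x>, #gpu.block<y>]}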

// Tile reduction dimension.
%tiled_reduction, %loop =
transform.structured.tile %tiled_matmul [0, 0, 10]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
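
// The per-workgroup matmul now sits inside a serial loop over K
// (illustrative sketch, not literal IR):
//
//   scf.for %k = %c0 to %c200 step %c10 iter_args(%acc = %acc_t) {
//     %a = tensor.extract_slice ...   // tensor<15x10xf32>
//     %b = tensor.extract_slice ...   // tensor<10x20xf32>
//     %r = linalg.matmul ins(%a, %b ...) outs(%acc ...)
//     scf.yield %r : tensor<15x20xf32>
//   }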

// Second-level tile to forall with tile_sizes [5, 10].
%forall_1, %tiled_matmul_1 =
transform.structured.tile_to_forall_op %tiled_reduction tile_sizes [5, 10]
( mapping = [#gpu.thread<x>, #gpu.thread<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Clean up.
transform.include @cleanup failures(propagate) (%variant_op) : (!transform.any_op) -> ()
transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()

// Bufferize and drop the HAL descriptor from memref ops.
%variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op

// Post-bufferization: map the forall loops to workgroups and threads.
%memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
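// workgroup_dims = [3, 2, 1] follows from the tile sizes above: each 15x20
// workgroup tile is covered by 15/5 = 3 thread tiles along x and 20/10 = 2
// along y. (The subgroup_size of 8 is a property assumed for the target
// accelerator.)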
transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [3, 2, 1] subgroup_size = 8 : (!transform.any_op) -> ()
}
}
82 changes: 82 additions & 0 deletions transform_dialect/examples/accel/matmul_codegen_spec_pad.mlir
@@ -0,0 +1,82 @@
// This script shows an example of lowering a matmul through IREE for a special accelerator, using padding to obtain static tile shapes.
//
// ```
// export IREE_DIR=${HOME}/github/iree; \
// export IREE_SAMPLES_DIR=${HOME}/github/iree-samples; \
// ${IREE_DIR}/build/tools/iree-opt \
// ${IREE_SAMPLES_DIR}/transform_dialect/examples/accel/matmul_source.mlir \
// --iree-hal-target-backends=llvm-cpu \
// --iree-abi-transformation-pipeline \
// --iree-flow-transformation-pipeline \
// --iree-stream-transformation-pipeline \
// --iree-hal-configuration-pipeline | \
// ${IREE_DIR}/build/tools/iree-opt \
// --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target, builtin.module(func.func(iree-hoist-statically-bound-allocations)))))' \
// --iree-codegen-llvmcpu-use-transform-dialect=${IREE_SAMPLES_DIR}/transform_dialect/examples/accel/matmul_codegen_spec_pad.mlir
// ```

module attributes { transform.with_named_sequence } {
transform.named_sequence @cleanup(%variant_op: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.iree.fold_fill_into_pad
transform.apply_patterns.scf.for_loop_canonicalization
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.iree.apply_licm %func : !transform.any_op
transform.iree.apply_cse %func : !transform.any_op
transform.yield
}

transform.sequence failures(propagate) {
^bb1(%variant_op: !transform.any_op):
%matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op

// First-level tile to forall with tile_sizes [16, 32].
%forall, %tiled_matmul =
transform.structured.tile_to_forall_op %matmul tile_sizes [16, 32]
( mapping = [#gpu.block<y>, #gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
: (!transform.any_op) -> ()
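
// Note the mapping order: "y" is the outermost distribution dimension here
// (rows, tiled by 16) and "x" the innermost (columns, tiled by 32), the
// reverse of matmul_codegen_spec.mlir.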

// Tile reduction dimension.
%tiled_reduction, %loop =
transform.structured.tile %tiled_matmul [0, 0, 8]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Pad the reduction-tiled matmul.
%padded, %pad, %__ = transform.structured.pad %tiled_reduction {
padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
padding_dimensions=[0, 1, 2],
pack_paddings=[1, 1, 0],
copy_back_op="none"
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
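
// Since M = 300 and N = 100 are not multiples of the 16x32 workgroup tile,
// boundary tiles are partial; padding restores static 16x8 (LHS) and 8x32
// (RHS) shapes. Roughly, pack_paddings = [1, 1, 0] (one flag per operand:
// LHS, RHS, accumulator) marks the LHS and RHS pads as nofold so the padded
// tiles materialize as explicit buffers, and copy_back_op = "none" skips
// copying the padded result back.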

// Second-level tile to forall with tile_sizes [8, 8].
%forall_1, %tiled_matmul_1 =
transform.structured.tile_to_forall_op %padded tile_sizes [8, 8]
( mapping = [#gpu.thread<y>, #gpu.thread<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Pad the thread-tiled matmul.
%padded_1, %pad_1, %_ = transform.structured.pad %tiled_matmul_1 {
padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
padding_dimensions=[0, 1, 2],
pack_paddings=[0, 0, 1],
copy_back_op="none"
} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
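
// The second pad targets the 8x8 thread tile; pack_paddings = [0, 0, 1] now
// keeps only the accumulator pad, so each thread gets its own padded output
// tile.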

// Clean up.
transform.include @cleanup failures(propagate) (%variant_op) : (!transform.any_op) -> ()
transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()

// Bufferize and drop the HAL descriptor from memref ops.
%variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op

// Post-bufferization: map the forall loops to workgroups and threads.
%memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
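// workgroup_dims = [4, 2, 1] follows from the tile sizes above: each 16x32
// workgroup tile is covered by 32/8 = 4 thread tiles along x and 16/8 = 2
// along y.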
transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 2, 1] subgroup_size = 1 : (!transform.any_op) -> ()
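// Hoist the fixed-size allocations introduced by padding out of the loops to
// the function entry (cf. the iree-hoist-statically-bound-allocations pass
// added to the example pipeline above).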
transform.iree.hoist_static_alloc %memref_func : (!transform.any_op) -> ()
}
}
6 changes: 6 additions & 0 deletions transform_dialect/examples/accel/matmul_source.mlir
@@ -0,0 +1,6 @@
func.func @matmul_example(%lhs: tensor<300x200xf32>, %rhs: tensor<200x100xf32>, %init : tensor<300x100xf32>) -> tensor<300x100xf32>
{
%res = linalg.matmul ins(%lhs, %rhs: tensor<300x200xf32>, tensor<200x100xf32>)
outs(%init: tensor<300x100xf32>) -> tensor<300x100xf32>
return %res : tensor<300x100xf32>
}
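
// Note: 300 and 100 are not multiples of the 16 and 32 tile sizes used in
// matmul_codegen_spec_pad.mlir, so boundary tiles there are partial and the
// padding path is exercised.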
