
Commit

squashed
newling committed Oct 2, 2024
1 parent a88304a commit b9f4242
Showing 7 changed files with 167 additions and 103 deletions.
4 changes: 2 additions & 2 deletions build_tools/ci/cpu_comparison/run.py
@@ -662,7 +662,7 @@ def run(self, config):
config,
test_name,
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -682,7 +682,7 @@ def run(self, config):
config,
test_files_dir / f"{name}.mlir",
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -8,7 +8,7 @@ iree_lit_test_suite(
NAME
lit
SRCS
"conv2d_nhwc_air_e2e.mlir"
"conv2d_nhwc_objectfifo_e2e.mlir"
"matmul_elementwise_pack_peel_air_e2e.mlir"
"matmul_pack_peel_air_e2e.mlir"
"matmul_pack_peel_objectfifo.mlir"
@@ -1,4 +1,4 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=air --split-input-file | FileCheck %s
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=objectFifo --split-input-file | FileCheck %s

func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32x64xi32>) -> tensor<2x12x12x64xi32> {
%cst = arith.constant 0 : i32
@@ -7,8 +7,11 @@
#include "iree-amd-aie/IR/AMDAIEAttrs.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/Pass/Pass.h"

#define DEBUG_TYPE "iree-amdaie-pack-and-transpose"
@@ -20,7 +23,6 @@ namespace {
static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
RewriterBase &rewriter, linalg::LinalgOp op,
SmallVector<OpFoldResult> packedSizes) {
// Fail on mismatched number of pack sizes.
if (packedSizes.size() != op.getNumLoops()) {
op->emitOpError(
"requires number of packed sizes match the number of loops (")
@@ -29,12 +31,14 @@ static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
}

rewriter.setInsertionPoint(op);
FailureOr<linalg::PackResult> packResult =
FailureOr<linalg::PackResult> maybePackResult =
linalg::pack(rewriter, op, packedSizes);
if (failed(packResult)) {
if (failed(maybePackResult)) {
op->emitOpError("failed to pack the operation");
return failure();
}

linalg::PackResult packResult = maybePackResult.value();
return packResult;
}

@@ -60,7 +64,8 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Find the linalg op for packing, currently only consider contraction ops
linalg::LinalgOp linalgOp;
funcOp->walk([&](linalg::LinalgOp op) {
if (linalg::isaContractionOpInterface(op)) {
if (linalg::isaContractionOpInterface(op) ||
isa<linalg::ConvolutionOpInterface>(op.getOperation())) {
linalgOp = op;
return WalkResult::interrupt();
}
@@ -75,6 +80,7 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Step 1. Before packing the operation, we will prefetch the lowering and
// packing config.
auto config = getLoweringConfig<IREE::Codegen::LoweringConfigAttr>(linalgOp);

auto packingConfig = getPackingConfig(linalgOp);

if (!config || !packingConfig) {
@@ -87,6 +93,12 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Extract packing config from the `linalgOp`.
PackingConfigPackingLevelAttr packCfg =
packingConfig.getPackingConfigVals(packLevel);

if (!packCfg) {
funcOp->emitOpError("failed to get pack config for pack level ")
<< packLevel;
return signalPassFailure();
}
SmallVector<OpFoldResult> packedSizes =
getAsIndexOpFoldResult(context, packCfg.getPackedSizes());

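Aside (not part of the diff): the maybePackResult rename above follows MLIR's usual FailureOr idiom, in which failed() is checked before the wrapped value is unwrapped with value(). A minimal self-contained sketch of that pattern, with invented halveEven/useHalve helpers used purely for illustration:

```cpp
#include "mlir/Support/LogicalResult.h"  // assumed path; newer MLIR keeps this in llvm/Support/LogicalResult.h

using mlir::failed;
using mlir::failure;
using mlir::FailureOr;
using mlir::LogicalResult;
using mlir::success;

// Hypothetical helper: succeeds only for even inputs.
static FailureOr<int> halveEven(int x) {
  if (x % 2 != 0) return failure();  // propagate failure, no value attached
  return x / 2;                      // implicitly wrapped in FailureOr<int>
}

// Caller: check failed() first, only then unwrap with value().
static LogicalResult useHalve(int x) {
  FailureOr<int> maybeHalf = halveEven(x);
  if (failed(maybeHalf)) return failure();  // bail before touching the value
  int half = maybeHalf.value();             // safe: failure ruled out above
  return success(half >= 0);
}
```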
@@ -472,34 +472,80 @@ static LogicalResult setRootConfigForPadPackPipeline(

static LogicalResult setRootConfigForConvDecomposePipeline(
mlir::FunctionOpInterface entryPointFn, linalg::LinalgOp linalgOp) {
MLIRContext *context = entryPointFn.getContext();

FailureOr<std::array<uint32_t, 3>> maybeInstructionSize =
getMatmulInstructionSize(linalgOp);
int64_t OW = 4;
int64_t OC = 4;
int64_t IC = 8;
if (succeeded(maybeInstructionSize)) {
auto instructionSize = maybeInstructionSize.value();
OW = instructionSize[0];
OC = instructionSize[1];
IC = instructionSize[2];
auto [m, n, k] = maybeInstructionSize.value();
OW = m;
OC = n;
IC = k;
}

SmallVector<int64_t> transposePackIndices{0, 1, 2};
SmallVector<bool> unpackEmpty{false, false, true};

// Convolution type specific vectors:
SmallVector<SmallVector<int64_t>> innerPerm;
SmallVector<SmallVector<int64_t>> outerPerm;
SmallVector<int64_t> tileSizeLevel0;
SmallVector<int64_t> tileSizeLevel1;
SmallVector<int64_t> tileSizeLevel2;
// Note: some of the tiling dimensions are hardcoded for now.
if (isa<linalg::Conv2DNhwcHwcfOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp)) {
// conv_2d_nhwc_hwcf tiling dims: [N, OH, OW, OC, KH, KW, IC].
tileSizeLevel0 = {0, 4, OW, OC, 0, 0, 0};
SmallVector<int64_t> packingSizes;

// [N, OH, OW, OC, KH, KW, IC].
if (isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfOp>(linalgOp)) {
// The goal is to pack the input image and kernel as follows, when moving
// from L2 to L1:
//
// Example where input has 32 channels:
//
// %alloc_8 = memref.alloc() : memref<1x3x4x6x8xbf16, 2 : i32>
// iree_linalg_ext.pack %subview_5 outer_dims_perm = [0, 1, 3, 2]
// inner_dims_pos = [3]
// inner_tiles = [8] into %alloc_8 :
// (memref<1x3x6x32xbf16, strided<[576, 192, 32, 1], offset: ?>, 1 : i32>
// memref<1x3x4x6x8xbf16, 2 : i32>)
//
// %alloc_9 = memref.alloc() : memref<3x3x4x1x8x4xbf16, 2 : i32>
// iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 2, 3]
// inner_dims_pos = [2, 3]
// inner_tiles = [8, 4] into %alloc_9 :
// (memref<3x3x32x4xbf16, strided<[384, 128, 4, 1], offset: ?>, 1 : i32>
// memref<3x3x4x1x8x4xbf16, 2 : i32>)
innerPerm = {{}, {{1, 0}}, {}};
outerPerm = {{0, 1, 3, 2}, {}, {0, 1, 2, 3}};
packingSizes = {0, 0, 0, OC, 0, 0, IC};
tileSizeLevel1 = {1, 1, OW, OC, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, IC};
} else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// conv_2d_nchw_fchw tiling dims: [N, OC, OH, OW, IC, KH, KW].
tileSizeLevel0 = {0, OC, 4, OW, 0, 0, 0};
tileSizeLevel1 = {1, OC, 1, OW, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, IC, 1, 1};
} else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// convert the kernel height, kernel width, and outer IC reduction into
// scf.for loops, leaving just a matmul of the instruction size inside
// the loops.
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 1, 0, 0};
}

// [N, OC, OH, OW, IC, KH, KW]
else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// To have an inner-loop matmul, 'IC' must be the innermost dimension.
// Permuting the dimensions of a CHW image to channel-last via DMA requires
// that H*W is a power of 2. For kernels it requires that h*w is a power of
// 2, which is even less likely to be the case as we typically have h=w=3.
// The dimension permutations would therefore often need to be done on the
// core.
//
// We are leaving this for future work; the expectation for now is that
// models have been transformed at a high level to avoid channel-first
// convolutions.
return linalgOp.emitError(
"Only channel-last convolution supported currently.");
}

// [N, OH, OW, C, KH, KW]
else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// Notes:
// =====
//
@@ -512,14 +558,10 @@ static LogicalResult setRootConfigForConvDecomposePipeline(
// no checks that the data tiles are not too large, or that the input
// dimensions are perfectly tiled by the hard-coded tile dimensions below.
// These will be done as a follow-up task.
//
//
// Below we target a 4x4 array of AIE cores.
auto getElementType = [](Value v) {
return cast<ShapedType>(v.getType()).getElementType();
};
const uint16_t OW_0 = 4;
const uint16_t OH_0 = 4;
const uint16_t OH_1 = 1;

auto operandType = getElementType(linalgOp->getOperand(0));
@@ -530,26 +572,56 @@ static LogicalResult setRootConfigForConvDecomposePipeline(
OC_0 = maybeMacNumElements.value();
}
// If the operand type has fewer than 32-bits, we really should be able to
// get a mac-width for it Bail because we didn't, and there's probably just
// something missing in the table.
// get a mac-width for it. Bail because we didn't; there's probably just
// something missing in a table.
else if (operandType.getIntOrFloatBitWidth() < 32) {
return linalgOp.emitError(
"has an operand type with fewer than 32-bits, but no mac-width "
"could be determined.");
}

const uint16_t OC_1 = OC_0 / 4;

// depthwise_conv2d_nhwc_hwc tiling dims:
// [N, OH, OW, OC, KH, KW]
tileSizeLevel0 = {1, OH_0, OW_0, OC_0, 0, 0};
packingSizes = {0, 0, 0, OC_1, 0, 0};
innerPerm = {{}, {}, {}};
outerPerm = {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 2, 3}};
tileSizeLevel1 = {1, OH_1, OW_0, OC_1, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1};
} else {
assert(false && "Support must be added for this convolution op");
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 0};
}

else {
assert(false &&
"unrecognised convolution op, cannot set packing config, support "
"must be added.");
}

// For the objectFifo backend we currently target a single core.
// Next step: increase number of cores targeted, figure out what
//
// ```
// error: 'aie.memtile_dma' op could not find and assign a valid BD id
// ```
//
// means. TODO(newling) create task to track this.
tileSizeLevel0 = tileSizeLevel1;

assert(!innerPerm.empty() && !outerPerm.empty() && !packingSizes.empty() &&
!tileSizeLevel0.empty() && !tileSizeLevel1.empty() &&
"not all vectors for initializing config are non-empty");

auto packingConfigLevel1Attr = getPackingConfigPackingLevelAttr(
context, packingSizes, transposePackIndices, unpackEmpty, innerPerm,
outerPerm);
SmallVector<PackingConfigPackingLevelAttr> packingConfigLevelsVal{
packingConfigLevel1Attr};

auto packingConfigLevels =
PackingConfigPackingLevelsAttr::get(context, packingConfigLevelsVal);
auto config = PackingConfigAttr::get(context, packingConfigLevels);
setPackingConfig(linalgOp, config);

TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1,
tileSizeLevel2};

return setOpConfigAndEntryPointFnTranslation(
entryPointFn, linalgOp, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::Custom);
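Aside (not part of the diff): a worked example of the configuration assembled above. With the fallback instruction sizes OW = 4, OC = 4, IC = 8 and a linalg.conv_2d_nhwc_hwcf root op, the vectors come out as below; the outermost tiling level simply mirrors level 1 because the objectFifo pipeline currently targets a single core.

```cpp
#include <array>
#include <cstdint>

// Fallback sizes used when no matmul instruction size is found for the
// element type (see the top of setRootConfigForConvDecomposePipeline).
constexpr int64_t OW = 4, OC = 4, IC = 8;

// conv_2d_nhwc_hwcf loop order: [N, OH, OW, OC, KH, KW, IC].
constexpr std::array<int64_t, 7> packingSizes   = {0, 0, 0, OC, 0, 0, IC};
constexpr std::array<int64_t, 7> tileSizeLevel1 = {1, 1, OW, OC, 0, 0, 0};
constexpr std::array<int64_t, 7> tileSizeLevel2 = {0, 0, 0, 0, 1, 1, IC};
// Single-core objectFifo target: the outermost level mirrors level 1.
constexpr std::array<int64_t, 7> tileSizeLevel0 = tileSizeLevel1;
```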

