[Convolution] Packing + objectFifo, initial support #789

Merged (5 commits, Oct 9, 2024)
build_tools/ci/cpu_comparison/run.py (4 changes: 2 additions & 2 deletions)
@@ -641,7 +641,7 @@ def run(self, config):
config,
test_name,
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -661,7 +661,7 @@ def run(self, config):
config,
test_files_dir / f"{name}.mlir",
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -8,7 +8,7 @@ iree_lit_test_suite(
NAME
lit
SRCS
"conv2d_nhwc_air_e2e.mlir"
"conv2d_nhwc_objectfifo_e2e.mlir"
"matmul_elementwise_pack_peel_air_e2e.mlir"
"matmul_pack_peel_air_e2e.mlir"
"matmul_pack_peel_objectfifo.mlir"
@@ -1,4 +1,4 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=air --split-input-file | FileCheck %s
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=objectFifo --split-input-file | FileCheck %s

func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32x64xi32>) -> tensor<2x12x12x64xi32> {
%cst = arith.constant 0 : i32
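The RUN line above now drives the conv-decompose tile pipeline through the objectFifo lowering rather than AIR. As a quick sanity check of the shapes in this lit test (2x14x14x32 image, 3x3x32x64 filter, stride 1, no padding), the standalone sketch below recomputes the expected output size; the helper name and the program itself are illustrative only and are not part of the test or the repository.

#include <array>
#include <cstdint>
#include <iostream>

// Output shape of a stride-1, unpadded NHWC/HWCF 2-d convolution:
// OH = IH - KH + 1, OW = IW - KW + 1 (illustrative helper, not repo code).
static std::array<int64_t, 4> convNhwcHwcfOutputShape(
    const std::array<int64_t, 4> &image,    // [N, IH, IW, IC]
    const std::array<int64_t, 4> &filter) {  // [KH, KW, IC, OC]
  return {image[0],                  // N
          image[1] - filter[0] + 1,  // OH
          image[2] - filter[1] + 1,  // OW
          filter[3]};                // OC
}

int main() {
  // Shapes from the lit test: tensor<2x14x14x32xi32> * tensor<3x3x32x64xi32>.
  auto out = convNhwcHwcfOutputShape({2, 14, 14, 32}, {3, 3, 32, 64});
  // Prints 2x12x12x64, matching tensor<2x12x12x64xi32> in the test.
  std::cout << out[0] << "x" << out[1] << "x" << out[2] << "x" << out[3]
            << "\n";
}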
@@ -20,7 +20,6 @@ namespace {
static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
RewriterBase &rewriter, linalg::LinalgOp op,
SmallVector<OpFoldResult> packedSizes) {
// Fail on mismatched number of pack sizes.
if (packedSizes.size() != op.getNumLoops()) {
op->emitOpError(
"requires number of packed sizes match the number of loops (")
@@ -29,12 +28,14 @@ static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
}

rewriter.setInsertionPoint(op);
FailureOr<linalg::PackResult> packResult =
FailureOr<linalg::PackResult> maybePackResult =
linalg::pack(rewriter, op, packedSizes);
if (failed(packResult)) {
if (failed(maybePackResult)) {
op->emitOpError("failed to pack the operation");
return failure();
}

linalg::PackResult packResult = maybePackResult.value();
return packResult;
}

@@ -60,7 +61,8 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Find the linalg op for packing, currently only consider contraction ops
linalg::LinalgOp linalgOp;
funcOp->walk([&](linalg::LinalgOp op) {
if (linalg::isaContractionOpInterface(op)) {
if (linalg::isaContractionOpInterface(op) ||
isa<linalg::ConvolutionOpInterface>(op.getOperation())) {
linalgOp = op;
return WalkResult::interrupt();
}
@@ -75,6 +77,7 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Step 1. Before packing the operation, we will prefetch the lowering and
// packing config.
auto config = getLoweringConfig<IREE::Codegen::LoweringConfigAttr>(linalgOp);

auto packingConfig = getPackingConfig(linalgOp);

if (!config || !packingConfig) {
@@ -87,6 +90,12 @@ static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
// Extract packing config from the `linalgOp`.
PackingConfigPackingLevelAttr packCfg =
packingConfig.getPackingConfigVals(packLevel);

if (!packCfg) {
funcOp->emitOpError("failed to get pack config for pack level ")
<< packLevel;
return signalPassFailure();
}
SmallVector<OpFoldResult> packedSizes =
getAsIndexOpFoldResult(context, packCfg.getPackedSizes());

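applyPackOnLinalgOp requires one packed size per loop of the op being packed, and the walk above now selects convolution ops as well as contractions. For the conv_2d_nhwc_hwcf case configured elsewhere in this PR, that means a 7-entry vector over the loops [N, OH, OW, OC, KH, KW, IC]. Below is a small standalone sketch of that consistency check; the function name and the setup are made up for illustration and do not mirror the pass's API.

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for the guard in applyPackOnLinalgOp: the number of
// packed sizes must equal the number of loops of the op being packed.
static bool packedSizesMatchLoops(const std::vector<int64_t> &packedSizes,
                                  int64_t numLoops) {
  return static_cast<int64_t>(packedSizes.size()) == numLoops;
}

int main() {
  // conv_2d_nhwc_hwcf has 7 loops: [N, OH, OW, OC, KH, KW, IC].
  // The packing config added in this PR uses {0, 0, 0, OC, 0, 0, IC},
  // i.e. only the OC and IC dimensions get non-zero pack sizes.
  const int64_t OC = 4, IC = 8;
  std::vector<int64_t> packedSizes = {0, 0, 0, OC, 0, 0, IC};
  std::cout << (packedSizesMatchLoops(packedSizes, 7) ? "ok" : "mismatch")
            << "\n";  // prints "ok"
}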
@@ -472,54 +472,83 @@ static LogicalResult setRootConfigForPadPackPipeline(

static LogicalResult setRootConfigForConvDecomposePipeline(
mlir::FunctionOpInterface entryPointFn, linalg::LinalgOp linalgOp) {
MLIRContext *context = entryPointFn.getContext();

FailureOr<std::array<uint32_t, 3>> maybeInstructionSize =
getMatmulInstructionSize(linalgOp);
int64_t OW = 4;
int64_t OC = 4;
int64_t IC = 8;
if (succeeded(maybeInstructionSize)) {
auto instructionSize = maybeInstructionSize.value();
OW = instructionSize[0];
OC = instructionSize[1];
IC = instructionSize[2];
auto [m, n, k] = maybeInstructionSize.value();
OW = m;
OC = n;
IC = k;
}

SmallVector<int64_t> transposePackIndices{0, 1, 2};
SmallVector<bool> unpackEmpty{false, false, true};

// Convolution type specific vectors:
SmallVector<SmallVector<int64_t>> innerPerm;
SmallVector<SmallVector<int64_t>> outerPerm;
SmallVector<int64_t> tileSizeLevel0;
SmallVector<int64_t> tileSizeLevel1;
SmallVector<int64_t> tileSizeLevel2;
// Note: some of the tiling dimensions are hardcoded for now.
if (isa<linalg::Conv2DNhwcHwcfOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp)) {
// conv_2d_nhwc_hwcf tiling dims: [N, OH, OW, OC, KH, KW, IC].
tileSizeLevel0 = {0, 4, OW, OC, 0, 0, 0};
SmallVector<int64_t> packingSizes;

// [N, OH, OW, OC, KH, KW, IC].
if (isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfOp>(linalgOp)) {
// The goal is to pack the input image and kernel as follows, when moving
// from L2 to L1 (example where there are 32 input channels):
// Image: memref<1x3x6x32xbf16> -> memref<1x3x4x6x8xbf16>
// Kernel: memref<3x3x32x4xbf16> -> memref<3x3x4x1x8x4xbf16>
innerPerm = {{}, {{1, 0}}, {}};
outerPerm = {{0, 1, 3, 2}, {}, {0, 1, 2, 3}};
packingSizes = {0, 0, 0, OC, 0, 0, IC};
// Target one column of 4 cores, each core processing a different
// output image row. TODO(newling) use 4x4 array.
// https://github.com/nod-ai/iree-amd-aie/issues/821
tileSizeLevel0 = {1, 4, OW, OC, 0, 0, 0};
tileSizeLevel1 = {1, 1, OW, OC, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, IC};
} else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// conv_2d_nchw_fchw tiling dims: [N, OC, OH, OW, IC, KH, KW].
tileSizeLevel0 = {0, OC, 4, OW, 0, 0, 0};
tileSizeLevel1 = {1, OC, 1, OW, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, IC, 1, 1};
} else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// Notes:
// scf.for tiling of KH, KW, and (packed) IC dimensions:
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 1, 0, 0};
}

// [N, OC, OH, OW, IC, KH, KW]
else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// The matmul reduction dimension is the input channel (IC) dimension.
// For Conv2DNhwcHwcfOp, this dimension is already the inner-most dimension
// of the input image, and the penultimate dimension of the kernel --
// exactly what we want. For Conv2DNchwFchwOp, can the tensor dimensions be
// permuted in DMA to get them in the correct positions? For the image
// tensor, only if H*W is a nice power of 2 (DMA constraint). For kernels,
// it requires that h*w is a nice power of 2 -- unlikely, we typically have
// h=w=3. The dimension permutations will therefore often need to be done
// on the core. We leave this for future work; the expectation for
// now is that models have been transformed at a high level to avoid
// channel-first convolutions.
return linalgOp.emitError(
"Only channel-last convolution supported currently.");
}

// [N, OH, OW, C, KH, KW]
else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// Notes
// =====
//
// An inherent property of depthwise convolutions is that they cannot be
// expressed in terms of matmuls, unlike the above (dense) conv-2ds. The
// tile sizes we choose below are therefore not constrained by the AIE
// matmul instructions.
// A property of depthwise convolution is that it can't be expressed in
// terms of matmul, unlike the above (dense) conv-2ds. The tile sizes we
// choose below are therefore not constrained by AIE matmul instructions.
//
// The logic is currently fragile, and there are no guardrails: there are
// no checks that the data tiles are not too large, or that the input
// dimensions are perfectly tiled by the hard-coded tile dimensions below.
// These will be done as a follow-up task.
//
//
// Below we target a 4x4 array of AIE cores.
auto getElementType = [](Value v) {
return cast<ShapedType>(v.getType()).getElementType();
};
const uint16_t OW_0 = 4;
const uint16_t OH_0 = 4;
const uint16_t OH_1 = 1;

auto operandType = getElementType(linalgOp->getOperand(0));
@@ -530,26 +559,49 @@ static LogicalResult setRootConfigForConvDecomposePipeline(
OC_0 = maybeMacNumElements.value();
}
// If the operand type has fewer than 32-bits, we really should be able to
// get a mac-width for it Bail because we didn't, and there's probably just
// something missing in the table.
// get a mac-width for it. Bail because we didn't, there's probably just
// something missing in a table.
else if (operandType.getIntOrFloatBitWidth() < 32) {
return linalgOp.emitError(
"has an operand type with fewer than 32-bits, but no mac-width "
"could be determined.");
}

const uint16_t OC_1 = OC_0 / 4;

// depthwise_conv2d_nhwc_hwc tiling dims:
// [N, OH, OW, OC, KH,KW]
tileSizeLevel0 = {1, OH_0, OW_0, OC_0, 0, 0};
packingSizes = {0, 0, 0, OC_1, 0, 0};
innerPerm = {{}, {}, {}};
outerPerm = {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 2, 3}};
// Target one column of 4 cores, each core processing a different
// output image row. TODO(newling) use 4x4 array.
// https://github.com/nod-ai/iree-amd-aie/issues/821
tileSizeLevel0 = {1, 4 * OH_1, OW_0, OC_1, 0, 0};
tileSizeLevel1 = {1, OH_1, OW_0, OC_1, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1};
} else {
assert(false && "Support must be added for this convolution op");
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 0};
}

else {
return linalgOp.emitError(
"unrecognised convolution op, cannot set packing config. ");
}

assert(!innerPerm.empty() && !outerPerm.empty() && !packingSizes.empty() &&
!tileSizeLevel0.empty() && !tileSizeLevel1.empty() &&
"not all vectors for initializing config are non-empty");

auto packingConfigLevel1Attr = getPackingConfigPackingLevelAttr(
context, packingSizes, transposePackIndices, unpackEmpty, innerPerm,
outerPerm);
SmallVector<PackingConfigPackingLevelAttr> packingConfigLevelsVal{
packingConfigLevel1Attr};

auto packingConfigLevels =
PackingConfigPackingLevelsAttr::get(context, packingConfigLevelsVal);
auto config = PackingConfigAttr::get(context, packingConfigLevels);
setPackingConfig(linalgOp, config);

TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1,
tileSizeLevel2};

return setOpConfigAndEntryPointFnTranslation(
entryPointFn, linalgOp, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::Custom);
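The comment in setRootConfigForConvDecomposePipeline above describes the intended L2-to-L1 packing: an image memref<1x3x6x32xbf16> becomes memref<1x3x4x6x8xbf16> and a kernel memref<3x3x32x4xbf16> becomes memref<3x3x4x1x8x4xbf16>. The standalone sketch below reproduces those shapes from the pack parameters as a rough model of what linalg::pack produces (each tiled dim split into outer = size / tile plus an inner tile, outer dims permuted, inner tiles appended). Note that the real config specifies pack sizes per iteration dimension of the conv ([N, OH, OW, OC, KH, KW, IC]) and maps them to operand dimensions through indexing maps; the sketch takes the already-mapped operand dimensions directly, and all names are made up for illustration rather than taken from the IREE API.

#include <cstdint>
#include <iostream>
#include <vector>

// Rough model of a pack result shape: each tiled dim is split into
// (outer = size / tile, inner = tile); the outer dims are permuted by
// outerPerm (empty = identity) and the inner tiles are appended in the
// order given by innerPerm (empty = identity). Illustrative only.
static std::vector<int64_t> packedShape(std::vector<int64_t> shape,
                                        const std::vector<int64_t> &tiledDims,
                                        const std::vector<int64_t> &tiles,
                                        const std::vector<int64_t> &outerPerm,
                                        const std::vector<int64_t> &innerPerm) {
  for (size_t i = 0; i < tiledDims.size(); ++i)
    shape[tiledDims[i]] /= tiles[i];

  std::vector<int64_t> result;
  if (outerPerm.empty()) {
    result = shape;
  } else {
    for (int64_t p : outerPerm) result.push_back(shape[p]);
  }
  if (innerPerm.empty()) {
    for (int64_t t : tiles) result.push_back(t);
  } else {
    for (int64_t p : innerPerm) result.push_back(tiles[p]);
  }
  return result;
}

static void print(const std::vector<int64_t> &s) {
  for (size_t i = 0; i < s.size(); ++i) std::cout << (i ? "x" : "") << s[i];
  std::cout << "\n";
}

int main() {
  const int64_t OC = 4, IC = 8;
  // Image [N, IH, IW, IC] = 1x3x6x32: tile IC by 8, outer perm {0, 1, 3, 2}.
  print(packedShape({1, 3, 6, 32}, {3}, {IC}, {0, 1, 3, 2}, {}));
  // -> 1x3x4x6x8, i.e. memref<1x3x4x6x8xbf16>.

  // Kernel [KH, KW, IC, OC] = 3x3x32x4: tile OC by 4 and IC by 8,
  // identity outer perm, inner perm {1, 0} (swap the two inner tiles).
  print(packedShape({3, 3, 32, 4}, {3, 2}, {OC, IC}, {}, {1, 0}));
  // -> 3x3x4x1x8x4, i.e. memref<3x3x4x1x8x4xbf16>.
}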
@@ -408,16 +408,20 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
bool enableVectorizationPasses,
TilePassPipeline useTilePipeline) {
auto addCleanups = [&]() {
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
};

// First level tiling using scf.forall
{
AMDAIETileAndFuseOptions tileFuseOptions;
tileFuseOptions.tilingLevel = 0;
tileFuseOptions.useSCFFor = false;
funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
addCleanups();
}
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

// Pad the linalg operation
{
@@ -441,67 +445,50 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
tileFuseOptions.tilingLevel = 1;
tileFuseOptions.useSCFFor = false;
funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
addCleanups();
}
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

// Fuse fill op into the inner forall loop
funcPassManager.addPass(createAMDAIEFuseFillIntoForallPass());
funcPassManager.addPass(createCanonicalizerPass());

// Pad the linalg operation
// Pack the linalg operation
{
AMDAIEPadOptions padOptions;
padOptions.paddingLevel = 1;
funcPassManager.addPass(createAMDAIEPadPass(padOptions));
AMDAIEPackAndTransposeOptions packOptions;
packOptions.packLevel = 0;
funcPassManager.addPass(createAMDAIEPackAndTransposePass(packOptions));
}

// Only promote the result to local memory
// Promote the inputs and results to local memory
{
AMDAIEBufferizeToAllocationOptions bufferizeOptions;
bufferizeOptions.memorySpace = 2;
bufferizeOptions.bufferizeOperand = BufferizeOperand::Output;
bufferizeOptions.bufferizeOperand = BufferizeOperand::InputOutput;
funcPassManager.addPass(
createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
addCleanups();
}

// Tile the reduction dimension using scf.for
{
AMDAIETileAndFuseOptions tileFuseOptions;
tileFuseOptions.tilingLevel = 2;
tileFuseOptions.useSCFFor = true;
funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
}
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

// Pad the linalg operation
{
AMDAIEPadOptions padOptions;
padOptions.paddingLevel = 2;
funcPassManager.addPass(createAMDAIEPadPass(padOptions));
}

// Promote the inputs to local memory
{
AMDAIEBufferizeToAllocationOptions bufferizeOptions;
bufferizeOptions.memorySpace = 2;
bufferizeOptions.bufferizeOperand = BufferizeOperand::Input;
funcPassManager.addPass(
createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
addCleanups();
}

// Decompose Conv2d ops to Conv1d ops
funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass());
LinalgFoldUnitExtentDimsPassOptions opts;
opts.useRankReducingSlices = true;
funcPassManager.addPass(mlir::createLinalgFoldUnitExtentDimsPass(opts));

// Vectorization passes
// FIXME(newling) https://github.com/nod-ai/iree-amd-aie/issues/820
enableVectorizationPasses = false;
appendVectorizationToPipeline(funcPassManager, enableVectorizationPasses);
funcPassManager.addPass(createCanonicalizerPass());

// Comprehensive bufferization
addAMDAIEBufferizePasses(funcPassManager, useTilePipeline);
funcPassManager.addPass(createHoistStaticallyBoundAllocationsPass());
}

void buildAMDAIETransformPassPipeline(
Expand Down Expand Up @@ -557,6 +544,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
bool enablePacketFlow) {
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
passManager.addPass(memref::createFoldMemRefAliasOpsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createAMDAIEConvertToDmaPass());

passManager.addPass(createAMDAIENormalizeLoopBoundsPass());
@@ -582,6 +570,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
passManager.addPass(createAMDAIEAssignLogicalObjectFifoDepthPass());
passManager.addPass(createAMDAIEAccessToAcquireReleasePass());
passManager.addPass(createAMDAIENoneAccessToTemporaryBufferPass());

passManager.addPass(
createAMDAIEAssignConnectionTypesPass({enablePacketFlow}));
passManager.addPass(createCSEPass());
@@ -612,6 +601,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
passManager.addPass(createCanonicalizerPass());

passManager.addPass(createAMDAIEObjFifoBufferizationPass());
passManager.addPass(createAMDAIETemporaryAllocBufferizationPass());
passManager.addPass(createAMDAIEConnectionToFlowPass());
passManager.addPass(createAMDAIEAssignPacketIdsPass());

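The main structural change in addConvDecomposePassPipeline above is factoring the repeated cleanup passes (AMDAIECleanup, canonicalizer, CSE) into an addCleanups lambda that is invoked after each tiling and bufferization step instead of being written out inline. A toy standalone sketch of that capture-by-reference pattern, using a plain string list as a stand-in for the pass manager; the labels are descriptive placeholders, not real pass flags.

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Stand-in for the function pass manager: just an ordered list of labels.
  std::vector<std::string> pipeline;
  auto addPass = [&](const std::string &name) { pipeline.push_back(name); };

  // Mirrors the addCleanups lambda in addConvDecomposePassPipeline: the same
  // three passes are appended after each tiling level without repetition.
  auto addCleanups = [&]() {
    addPass("amdaie-cleanup");
    addPass("canonicalize");
    addPass("cse");
  };

  addPass("tile-and-fuse (level 0)");
  addCleanups();
  addPass("tile-and-fuse (level 1)");
  addCleanups();

  for (const auto &p : pipeline) std::cout << p << "\n";
}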