
Commit

squashed
newling committed Oct 2, 2024
1 parent a88304a commit b9f4242
Showing 7 changed files with 167 additions and 103 deletions.
4 changes: 2 additions & 2 deletions build_tools/ci/cpu_comparison/run.py
@@ -662,7 +662,7 @@ def run(self, config):
config,
test_name,
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -682,7 +682,7 @@ def run(self, config):
config,
test_files_dir / f"{name}.mlir",
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -8,7 +8,7 @@ iree_lit_test_suite(
NAME
lit
SRCS
"conv2d_nhwc_air_e2e.mlir"
"conv2d_nhwc_objectfifo_e2e.mlir"
"matmul_elementwise_pack_peel_air_e2e.mlir"
"matmul_pack_peel_air_e2e.mlir"
"matmul_pack_peel_objectfifo.mlir"
@@ -1,4 +1,4 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=air --split-input-file | FileCheck %s
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=objectFifo --split-input-file | FileCheck %s

func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32x64xi32>) -> tensor<2x12x12x64xi32> {
%cst = arith.constant 0 : i32
@@ -7,8 +7,11 @@
#include "iree-amd-aie/IR/AMDAIEAttrs.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/Pass/Pass.h"

#define DEBUG_TYPE "iree-amdaie-pack-and-transpose"
@@ -20,7 +23,6 @@ namespace {
static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
RewriterBase &rewriter, linalg::LinalgOp op,
SmallVector<OpFoldResult> packedSizes) {
// Fail on mismatched number of pack sizes.
if (packedSizes.size() != op.getNumLoops()) {
op->emitOpError(
"requires number of packed sizes match the number of loops (")
@@ -29,12 +31,14 @@ static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
}

rewriter.setInsertionPoint(op);
FailureOr<linalg::PackResult> packResult =
FailureOr<linalg::PackResult> maybePackResult =
linalg::pack(rewriter, op, packedSizes);
if (failed(packResult)) {
if (failed(maybePackResult)) {
op->emitOpError("failed to pack the operation");
return failure();
}

linalg::PackResult packResult = maybePackResult.value();
return packResult;
}

@@ -60,7 +64,8 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Find the linalg op for packing, currently only consider contraction ops
linalg::LinalgOp linalgOp;
funcOp->walk([&](linalg::LinalgOp op) {
if (linalg::isaContractionOpInterface(op)) {
if (linalg::isaContractionOpInterface(op) ||
isa<linalg::ConvolutionOpInterface>(op.getOperation())) {
linalgOp = op;
return WalkResult::interrupt();
}
@@ -75,6 +80,7 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Step 1. Before packing the operation, we will prefetch the lowering and
// packing config.
auto config = getLoweringConfig<IREE::Codegen::LoweringConfigAttr>(linalgOp);

auto packingConfig = getPackingConfig(linalgOp);

if (!config || !packingConfig) {
@@ -87,6 +93,12 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Extract packing config from the `linalgOp`.
PackingConfigPackingLevelAttr packCfg =
packingConfig.getPackingConfigVals(packLevel);

if (!packCfg) {
funcOp->emitOpError("failed to get pack config for pack level ")
<< packLevel;
return signalPassFailure();
}
SmallVector<OpFoldResult> packedSizes =
getAsIndexOpFoldResult(context, packCfg.getPackedSizes());

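Aside (not part of the diff): the maybePackResult rename above follows MLIR's usual FailureOr idiom, in which failed() is checked before the wrapped value is unwrapped with value(). A minimal self-contained sketch of that pattern, with invented halveEven/useHalve helpers used purely for illustration:

```cpp
#include "mlir/Support/LogicalResult.h"  // assumed path; newer MLIR keeps this in llvm/Support/LogicalResult.h

using mlir::failed;
using mlir::failure;
using mlir::FailureOr;
using mlir::LogicalResult;
using mlir::success;

// Hypothetical helper: succeeds only for even inputs.
static FailureOr<int> halveEven(int x) {
  if (x % 2 != 0) return failure();  // propagate failure, no value attached
  return x / 2;                      // implicitly wrapped in FailureOr<int>
}

// Caller: check failed() first, only then unwrap with value().
static LogicalResult useHalve(int x) {
  FailureOr<int> maybeHalf = halveEven(x);
  if (failed(maybeHalf)) return failure();  // bail before touching the value
  int half = maybeHalf.value();             // safe: failure ruled out above
  return success(half >= 0);
}
```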
@@ -472,34 +472,80 @@ static LogicalResult setRootConfigForPadPackPipeline(

static LogicalResult setRootConfigForConvDecomposePipeline(
mlir::FunctionOpInterface entryPointFn, linalg::LinalgOp linalgOp) {
MLIRContext *context = entryPointFn.getContext();

FailureOr<std::array<uint32_t, 3>> maybeInstructionSize =
getMatmulInstructionSize(linalgOp);
int64_t OW = 4;
int64_t OC = 4;
int64_t IC = 8;
if (succeeded(maybeInstructionSize)) {
auto instructionSize = maybeInstructionSize.value();
OW = instructionSize[0];
OC = instructionSize[1];
IC = instructionSize[2];
auto [m, n, k] = maybeInstructionSize.value();
OW = m;
OC = n;
IC = k;
}

SmallVector<int64_t> transposePackIndices{0, 1, 2};
SmallVector<bool> unpackEmpty{false, false, true};

// Convolution type specific vectors:
SmallVector<SmallVector<int64_t>> innerPerm;
SmallVector<SmallVector<int64_t>> outerPerm;
SmallVector<int64_t> tileSizeLevel0;
SmallVector<int64_t> tileSizeLevel1;
SmallVector<int64_t> tileSizeLevel2;
// Note: some of the tiling dimensions are hardcoded for now.
if (isa<linalg::Conv2DNhwcHwcfOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp)) {
// conv_2d_nhwc_hwcf tiling dims: [N, OH, OW, OC, KH, KW, IC].
tileSizeLevel0 = {0, 4, OW, OC, 0, 0, 0};
SmallVector<int64_t> packingSizes;

// [N, OH, OW, OC, KH, KW, IC].
if (isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfOp>(linalgOp)) {
// The goal is to pack the input image and kernel as follows, when moving
// from L2 to L1:
//
// Example where input has 32 channels:
//
// %alloc_8 = memref.alloc() : memref<1x3x4x6x8xbf16, 2 : i32>
// iree_linalg_ext.pack %subview_5 outer_dims_perm = [0, 1, 3, 2]
// inner_dims_pos = [3]
// inner_tiles = [8] into %alloc_8 :
// (memref<1x3x6x32xbf16, strided<[576, 192, 32, 1], offset: ?>, 1 : i32>
// memref<1x3x4x6x8xbf16, 2 : i32>)
//
// %alloc_9 = memref.alloc() : memref<3x3x4x1x8x4xbf16, 2 : i32>
// iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 2, 3]
// inner_dims_pos = [2, 3]
// inner_tiles = [8, 4] into %alloc_9 :
// (memref<3x3x32x4xbf16, strided<[384, 128, 4, 1], offset: ?>, 1 : i32>
// memref<3x3x4x1x8x4xbf16, 2 : i32>)
innerPerm = {{}, {{1, 0}}, {}};
outerPerm = {{0, 1, 3, 2}, {}, {0, 1, 2, 3}};
packingSizes = {0, 0, 0, OC, 0, 0, IC};
tileSizeLevel1 = {1, 1, OW, OC, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, IC};
} else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// conv_2d_nchw_fchw tiling dims: [N, OC, OH, OW, IC, KH, KW].
tileSizeLevel0 = {0, OC, 4, OW, 0, 0, 0};
tileSizeLevel1 = {1, OC, 1, OW, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, IC, 1, 1};
} else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// convert the kernel height, kernel width, and outer IC reduction into
// scf.for loops, leaving just a matmul of the instruction size inside
// the loops.
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 1, 0, 0};
}

// [N, OC, OH, OW, IC, KH, KW]
else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// To have an inner-loop matmul, 'IC' must be the innermost dimension.
// Permuting the dimensions of a CHW image to channel-last via DMA requires
// that H*W is a power of 2. For kernels it requires that h*w is a power of
// 2, which is even less likely to be the case as we typically have h=w=3.
// The dimension permutations would therefore often need to be done on the
// core.
//
// We are leaving this for future work; the expectation for now is that
// models have been transformed at a high level to avoid channel-first
// convolutions.
return linalgOp.emitError(
"Only channel-last convolution supported currently.");
}

// [N, OH, OW, C, KH, KW]
else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// Notes:
// =====
//
@@ -512,14 +558,10 @@ static LogicalResult setRootConfigForConvDecomposePipeline(
// no checks that the data tiles are not too large, or that the input
// dimensions are perfectly tiled by the hard-coded tile dimensions below.
// These will be done as a follow-up task.
//
//
// Below we target a 4x4 array of AIE cores.
auto getElementType = [](Value v) {
return cast<ShapedType>(v.getType()).getElementType();
};
const uint16_t OW_0 = 4;
const uint16_t OH_0 = 4;
const uint16_t OH_1 = 1;

auto operandType = getElementType(linalgOp->getOperand(0));
@@ -530,26 +572,56 @@ static LogicalResult setRootConfigForConvDecomposePipeline(
OC_0 = maybeMacNumElements.value();
}
// If the operand type has fewer than 32-bits, we really should be able to
// get a mac-width for it Bail because we didn't, and there's probably just
// something missing in the table.
// get a mac-width for it. Bail because we didn't; there's probably just
// something missing in a table.
else if (operandType.getIntOrFloatBitWidth() < 32) {
return linalgOp.emitError(
"has an operand type with fewer than 32-bits, but no mac-width "
"could be determined.");
}

const uint16_t OC_1 = OC_0 / 4;

// depthwise_conv2d_nhwc_hwc tiling dims:
// [N, OH, OW, OC, KH, KW]
tileSizeLevel0 = {1, OH_0, OW_0, OC_0, 0, 0};
packingSizes = {0, 0, 0, OC_1, 0, 0};
innerPerm = {{}, {}, {}};
outerPerm = {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 2, 3}};
tileSizeLevel1 = {1, OH_1, OW_0, OC_1, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1};
} else {
assert(false && "Support must be added for this convolution op");
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 0};
}

else {
assert(false &&
"unrecognised convolution op, cannot set packing config, support "
"must be added.");
}

// For the objectFifo backend we currently target a single core.
// Next step: increase number of cores targeted, figure out what
//
// ```
// error: 'aie.memtile_dma' op could not find and assign a valid BD id
// ```
//
// means. TODO(newling) create task to track this.
tileSizeLevel0 = tileSizeLevel1;

assert(!innerPerm.empty() && !outerPerm.empty() && !packingSizes.empty() &&
!tileSizeLevel0.empty() && !tileSizeLevel1.empty() &&
"not all vectors for initializing config are non-empty");

auto packingConfigLevel1Attr = getPackingConfigPackingLevelAttr(
context, packingSizes, transposePackIndices, unpackEmpty, innerPerm,
outerPerm);
SmallVector<PackingConfigPackingLevelAttr> packingConfigLevelsVal{
packingConfigLevel1Attr};

auto packingConfigLevels =
PackingConfigPackingLevelsAttr::get(context, packingConfigLevelsVal);
auto config = PackingConfigAttr::get(context, packingConfigLevels);
setPackingConfig(linalgOp, config);

TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1,
tileSizeLevel2};

return setOpConfigAndEntryPointFnTranslation(
entryPointFn, linalgOp, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::Custom);
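Aside (not part of the diff): a worked example of the configuration assembled above. With the fallback instruction sizes OW = 4, OC = 4, IC = 8 and a linalg.conv_2d_nhwc_hwcf root op, the vectors come out as below; the outermost tiling level simply mirrors level 1 because the objectFifo pipeline currently targets a single core.

```cpp
#include <array>
#include <cstdint>

// Fallback sizes used when no matmul instruction size is found for the
// element type (see the top of setRootConfigForConvDecomposePipeline).
constexpr int64_t OW = 4, OC = 4, IC = 8;

// conv_2d_nhwc_hwcf loop order: [N, OH, OW, OC, KH, KW, IC].
constexpr std::array<int64_t, 7> packingSizes   = {0, 0, 0, OC, 0, 0, IC};
constexpr std::array<int64_t, 7> tileSizeLevel1 = {1, 1, OW, OC, 0, 0, 0};
constexpr std::array<int64_t, 7> tileSizeLevel2 = {0, 0, 0, 0, 1, 1, IC};
// Single-core objectFifo target: the outermost level mirrors level 1.
constexpr std::array<int64_t, 7> tileSizeLevel0 = tileSizeLevel1;
```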

