[Convolution] Packing + objectFifo, initial support #789

Merged (5 commits, Oct 9, 2024)
build_tools/ci/cpu_comparison/run.py (4 changes: 2 additions & 2 deletions)
@@ -641,7 +641,7 @@ def run(self, config):
config,
test_name,
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -661,7 +661,7 @@ def run(self, config):
config,
test_files_dir / f"{name}.mlir",
tile_pipeline="conv-decompose",
lower_to_aie_pipeline="air",
lower_to_aie_pipeline="objectFifo",
n_repeats=n_conv_repeats,
)

@@ -8,7 +8,7 @@ iree_lit_test_suite(
NAME
lit
SRCS
"conv2d_nhwc_air_e2e.mlir"
"conv2d_nhwc_objectfifo_e2e.mlir"
"matmul_elementwise_pack_peel_air_e2e.mlir"
"matmul_pack_peel_air_e2e.mlir"
"matmul_pack_peel_objectfifo.mlir"
@@ -1,4 +1,4 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=air --split-input-file | FileCheck %s
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --iree-amdaie-lower-to-aie-pipeline=objectFifo --split-input-file | FileCheck %s

func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xi32>, %arg1: tensor<3x3x32x64xi32>) -> tensor<2x12x12x64xi32> {
%cst = arith.constant 0 : i32
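The RUN line above now drives the conv-decompose tile pipeline through the objectFifo lowering rather than AIR. As a quick sanity check of the shapes in this lit test (2x14x14x32 image, 3x3x32x64 filter, stride 1, no padding), the standalone sketch below recomputes the expected output size; the helper name and the program itself are illustrative only and are not part of the test or the repository.

#include <array>
#include <cstdint>
#include <iostream>

// Output shape of a stride-1, unpadded NHWC/HWCF 2-d convolution:
// OH = IH - KH + 1, OW = IW - KW + 1 (illustrative helper, not repo code).
static std::array<int64_t, 4> convNhwcHwcfOutputShape(
    const std::array<int64_t, 4> &image,    // [N, IH, IW, IC]
    const std::array<int64_t, 4> &filter) {  // [KH, KW, IC, OC]
  return {image[0],                  // N
          image[1] - filter[0] + 1,  // OH
          image[2] - filter[1] + 1,  // OW
          filter[3]};                // OC
}

int main() {
  // Shapes from the lit test: tensor<2x14x14x32xi32> * tensor<3x3x32x64xi32>.
  auto out = convNhwcHwcfOutputShape({2, 14, 14, 32}, {3, 3, 32, 64});
  // Prints 2x12x12x64, matching tensor<2x12x12x64xi32> in the test.
  std::cout << out[0] << "x" << out[1] << "x" << out[2] << "x" << out[3]
            << "\n";
}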
@@ -20,7 +20,6 @@ namespace {
static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
RewriterBase &rewriter, linalg::LinalgOp op,
SmallVector<OpFoldResult> packedSizes) {
// Fail on mismatched number of pack sizes.
if (packedSizes.size() != op.getNumLoops()) {
op->emitOpError(
"requires number of packed sizes match the number of loops (")
@@ -29,12 +28,14 @@ static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
}

rewriter.setInsertionPoint(op);
FailureOr<linalg::PackResult> packResult =
FailureOr<linalg::PackResult> maybePackResult =
linalg::pack(rewriter, op, packedSizes);
if (failed(packResult)) {
if (failed(maybePackResult)) {
op->emitOpError("failed to pack the operation");
return failure();
}

linalg::PackResult packResult = maybePackResult.value();
return packResult;
}

@@ -60,7 +61,8 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Find the linalg op for packing, currently only consider contraction ops
linalg::LinalgOp linalgOp;
funcOp->walk([&](linalg::LinalgOp op) {
if (linalg::isaContractionOpInterface(op)) {
if (linalg::isaContractionOpInterface(op) ||
isa<linalg::ConvolutionOpInterface>(op.getOperation())) {
linalgOp = op;
return WalkResult::interrupt();
}
@@ -75,6 +77,7 @@ void AMDAIEPackAndTransposePass::runOnOperation() {
// Step 1. Before packing the operation, we will prefetch the lowering and
// packing config.
auto config = getLoweringConfig<IREE::Codegen::LoweringConfigAttr>(linalgOp);

auto packingConfig = getPackingConfig(linalgOp);

if (!config || !packingConfig) {
@@ -87,6 +90,12 @@ static FailureOr<linalg::PackResult> applyPackOnLinalgOp(
// Extract packing config from the `linalgOp`.
PackingConfigPackingLevelAttr packCfg =
packingConfig.getPackingConfigVals(packLevel);

if (!packCfg) {
funcOp->emitOpError("failed to get pack config for pack level ")
<< packLevel;
return signalPassFailure();
}
SmallVector<OpFoldResult> packedSizes =
getAsIndexOpFoldResult(context, packCfg.getPackedSizes());

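applyPackOnLinalgOp requires one packed size per loop of the op being packed, and the walk above now selects convolution ops as well as contractions. For the conv_2d_nhwc_hwcf case configured elsewhere in this PR, that means a 7-entry vector over the loops [N, OH, OW, OC, KH, KW, IC]. Below is a small standalone sketch of that consistency check; the function name and the setup are made up for illustration and do not mirror the pass's API.

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for the guard in applyPackOnLinalgOp: the number of
// packed sizes must equal the number of loops of the op being packed.
static bool packedSizesMatchLoops(const std::vector<int64_t> &packedSizes,
                                  int64_t numLoops) {
  return static_cast<int64_t>(packedSizes.size()) == numLoops;
}

int main() {
  // conv_2d_nhwc_hwcf has 7 loops: [N, OH, OW, OC, KH, KW, IC].
  // The packing config added in this PR uses {0, 0, 0, OC, 0, 0, IC},
  // i.e. only the OC and IC dimensions get non-zero pack sizes.
  const int64_t OC = 4, IC = 8;
  std::vector<int64_t> packedSizes = {0, 0, 0, OC, 0, 0, IC};
  std::cout << (packedSizesMatchLoops(packedSizes, 7) ? "ok" : "mismatch")
            << "\n";  // prints "ok"
}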
@@ -472,54 +472,83 @@ static LogicalResult setRootConfigForPadPackPipeline(

static LogicalResult setRootConfigForConvDecomposePipeline(
mlir::FunctionOpInterface entryPointFn, linalg::LinalgOp linalgOp) {
MLIRContext *context = entryPointFn.getContext();

FailureOr<std::array<uint32_t, 3>> maybeInstructionSize =
getMatmulInstructionSize(linalgOp);
int64_t OW = 4;
int64_t OC = 4;
int64_t IC = 8;
if (succeeded(maybeInstructionSize)) {
auto instructionSize = maybeInstructionSize.value();
OW = instructionSize[0];
OC = instructionSize[1];
IC = instructionSize[2];
auto [m, n, k] = maybeInstructionSize.value();
OW = m;
OC = n;
IC = k;
}

SmallVector<int64_t> transposePackIndices{0, 1, 2};
SmallVector<bool> unpackEmpty{false, false, true};

// Convolution type specific vectors:
SmallVector<SmallVector<int64_t>> innerPerm;
SmallVector<SmallVector<int64_t>> outerPerm;
SmallVector<int64_t> tileSizeLevel0;
SmallVector<int64_t> tileSizeLevel1;
SmallVector<int64_t> tileSizeLevel2;
// Note: some of the tiling dimensions are hardcoded for now.
if (isa<linalg::Conv2DNhwcHwcfOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp)) {
// conv_2d_nhwc_hwcf tiling dims: [N, OH, OW, OC, KH, KW, IC].
tileSizeLevel0 = {0, 4, OW, OC, 0, 0, 0};
SmallVector<int64_t> packingSizes;

// [N, OH, OW, OC, KH, KW, IC].
if (isa<linalg::Conv2DNhwcHwcfQOp>(linalgOp) ||
isa<linalg::Conv2DNhwcHwcfOp>(linalgOp)) {
// The goal is to pack the input image and kernel as follows, when moving
// from L2 to L1 (example where there are 32 input channels):
// Image: memref<1x3x6x32xbf16> -> memref<1x3x4x6x8xbf16>
// Kernel: memref<3x3x32x4xbf16> -> memref<3x3x4x1x8x4xbf16>
innerPerm = {{}, {{1, 0}}, {}};
outerPerm = {{0, 1, 3, 2}, {}, {0, 1, 2, 3}};
packingSizes = {0, 0, 0, OC, 0, 0, IC};
// Target one column of 4 cores, each core processing a different
// output image row. TODO(newling) use 4x4 array.
// https://github.com/nod-ai/iree-amd-aie/issues/821
tileSizeLevel0 = {1, 4, OW, OC, 0, 0, 0};
tileSizeLevel1 = {1, 1, OW, OC, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, IC};
} else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// conv_2d_nchw_fchw tiling dims: [N, OC, OH, OW, IC, KH, KW].
tileSizeLevel0 = {0, OC, 4, OW, 0, 0, 0};
tileSizeLevel1 = {1, OC, 1, OW, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, IC, 1, 1};
} else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// Notes:
// scf.for tiling of KH, KW, and (packed) IC dimensions:
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 1, 0, 0};
}

// [N, OC, OH, OW, IC, KH, KW]
else if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
// The matmul reduction dimension is the input channel (IC) dimension.
// For Conv2DNhwcHwcfOp, this dimension is already the inner-most dimension
// of the input image, and the penultimate dimension of the kernel --
// exactly what we want. For Conv2DNchwFchwOp, can the tensor dimensions be
// permuted in DMA to get them in the correct positions? For the image
// tensor, only if H*W is a nice power of 2 (DMA constraint). For kernels,
// it requires that h*w is a nice power of 2 -- unlikely, we typically have
// h=w=3. The dimension permutations will therefore often need to be done
// on the core. We leave this for future work; the expectation for
// now is that models have been transformed at a high level to avoid
// channel-first convolutions.
return linalgOp.emitError(
"Only channel-last convolution supported currently.");
}

// [N, OH, OW, C, KH, KW]
else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// Notes
// =====
//
// An inherent property of depthwise convolutions is that they cannot be
// expressed in terms of matmuls, unlike the above (dense) conv-2ds. The
// tile sizes we choose below are therefore not constrained by the AIE
// matmul instructions.
// A property of depthwise convolution is that it can't be expressed in
// terms of matmul, unlike the above (dense) conv-2ds. The tile sizes we
// choose below are therefore not constrained by AIE matmul instructions.
//
// The logic is currently fragile, and there are no guardrails: there are
// no checks that the data tiles are not too large, or that the input
// dimensions are perfectly tiled by the hard-coded tile dimensions below.
// These will be done as a follow-up task.
//
//
// Below we target a 4x4 array of AIE cores.
auto getElementType = [](Value v) {
return cast<ShapedType>(v.getType()).getElementType();
};
const uint16_t OW_0 = 4;
const uint16_t OH_0 = 4;
const uint16_t OH_1 = 1;

auto operandType = getElementType(linalgOp->getOperand(0));
@@ -530,26 +559,49 @@ static LogicalResult setRootConfigForConvDecomposePipeline(
OC_0 = maybeMacNumElements.value();
}
// If the operand type has fewer than 32-bits, we really should be able to
// get a mac-width for it Bail because we didn't, and there's probably just
// something missing in the table.
// get a mac-width for it. Bail because we didn't, there's probably just
// something missing in a table.
else if (operandType.getIntOrFloatBitWidth() < 32) {
return linalgOp.emitError(
"has an operand type with fewer than 32-bits, but no mac-width "
"could be determined.");
}

const uint16_t OC_1 = OC_0 / 4;

// depthwise_conv2d_nhwc_hwc tiling dims:
// [N, OH, OW, OC, KH,KW]
tileSizeLevel0 = {1, OH_0, OW_0, OC_0, 0, 0};
packingSizes = {0, 0, 0, OC_1, 0, 0};
innerPerm = {{}, {}, {}};
outerPerm = {{0, 1, 2, 3}, {0, 1, 2}, {0, 1, 2, 3}};
// Target one column of 4 cores, each core processing a different
// output image row. TODO(newling) use 4x4 array.
// https://github.com/nod-ai/iree-amd-aie/issues/821
tileSizeLevel0 = {1, 4 * OH_1, OW_0, OC_1, 0, 0};
tileSizeLevel1 = {1, OH_1, OW_0, OC_1, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1};
} else {
assert(false && "Support must be added for this convolution op");
tileSizeLevel2 = {0, 0, 0, 0, 1, 1, 0};
}

else {
return linalgOp.emitError(
"unrecognised convolution op, cannot set packing config. ");
}

assert(!innerPerm.empty() && !outerPerm.empty() && !packingSizes.empty() &&
!tileSizeLevel0.empty() && !tileSizeLevel1.empty() &&
"not all vectors for initializing config are non-empty");

auto packingConfigLevel1Attr = getPackingConfigPackingLevelAttr(
context, packingSizes, transposePackIndices, unpackEmpty, innerPerm,
outerPerm);
SmallVector<PackingConfigPackingLevelAttr> packingConfigLevelsVal{
packingConfigLevel1Attr};

auto packingConfigLevels =
PackingConfigPackingLevelsAttr::get(context, packingConfigLevelsVal);
auto config = PackingConfigAttr::get(context, packingConfigLevels);
setPackingConfig(linalgOp, config);

TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1,
tileSizeLevel2};

return setOpConfigAndEntryPointFnTranslation(
entryPointFn, linalgOp, tileSizes,
IREE::Codegen::DispatchLoweringPassPipeline::Custom);
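The comment in setRootConfigForConvDecomposePipeline above describes the intended L2-to-L1 packing: an image memref<1x3x6x32xbf16> becomes memref<1x3x4x6x8xbf16> and a kernel memref<3x3x32x4xbf16> becomes memref<3x3x4x1x8x4xbf16>. The standalone sketch below reproduces those shapes from the pack parameters as a rough model of what linalg::pack produces (each tiled dim split into outer = size / tile plus an inner tile, outer dims permuted, inner tiles appended). Note that the real config specifies pack sizes per iteration dimension of the conv ([N, OH, OW, OC, KH, KW, IC]) and maps them to operand dimensions through indexing maps; the sketch takes the already-mapped operand dimensions directly, and all names are made up for illustration rather than taken from the IREE API.

#include <cstdint>
#include <iostream>
#include <vector>

// Rough model of a pack result shape: each tiled dim is split into
// (outer = size / tile, inner = tile); the outer dims are permuted by
// outerPerm (empty = identity) and the inner tiles are appended in the
// order given by innerPerm (empty = identity). Illustrative only.
static std::vector<int64_t> packedShape(std::vector<int64_t> shape,
                                        const std::vector<int64_t> &tiledDims,
                                        const std::vector<int64_t> &tiles,
                                        const std::vector<int64_t> &outerPerm,
                                        const std::vector<int64_t> &innerPerm) {
  for (size_t i = 0; i < tiledDims.size(); ++i)
    shape[tiledDims[i]] /= tiles[i];

  std::vector<int64_t> result;
  if (outerPerm.empty()) {
    result = shape;
  } else {
    for (int64_t p : outerPerm) result.push_back(shape[p]);
  }
  if (innerPerm.empty()) {
    for (int64_t t : tiles) result.push_back(t);
  } else {
    for (int64_t p : innerPerm) result.push_back(tiles[p]);
  }
  return result;
}

static void print(const std::vector<int64_t> &s) {
  for (size_t i = 0; i < s.size(); ++i) std::cout << (i ? "x" : "") << s[i];
  std::cout << "\n";
}

int main() {
  const int64_t OC = 4, IC = 8;
  // Image [N, IH, IW, IC] = 1x3x6x32: tile IC by 8, outer perm {0, 1, 3, 2}.
  print(packedShape({1, 3, 6, 32}, {3}, {IC}, {0, 1, 3, 2}, {}));
  // -> 1x3x4x6x8, i.e. memref<1x3x4x6x8xbf16>.

  // Kernel [KH, KW, IC, OC] = 3x3x32x4: tile OC by 4 and IC by 8,
  // identity outer perm, inner perm {1, 0} (swap the two inner tiles).
  print(packedShape({3, 3, 32, 4}, {3, 2}, {OC, IC}, {}, {1, 0}));
  // -> 3x3x4x1x8x4, i.e. memref<3x3x4x1x8x4xbf16>.
}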
@@ -408,16 +408,20 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
bool enableVectorizationPasses,
TilePassPipeline useTilePipeline) {
auto addCleanups = [&]() {
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
};

// First level tiling using scf.forall
{
AMDAIETileAndFuseOptions tileFuseOptions;
tileFuseOptions.tilingLevel = 0;
tileFuseOptions.useSCFFor = false;
funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
addCleanups();
}
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

// Pad the linalg operation
{
@@ -441,67 +445,50 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
tileFuseOptions.tilingLevel = 1;
tileFuseOptions.useSCFFor = false;
funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
addCleanups();
}
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

// Fuse fill op into the inner forall loop
funcPassManager.addPass(createAMDAIEFuseFillIntoForallPass());
funcPassManager.addPass(createCanonicalizerPass());

// Pad the linalg operation
// Pack the linalg operation
{
AMDAIEPadOptions padOptions;
padOptions.paddingLevel = 1;
funcPassManager.addPass(createAMDAIEPadPass(padOptions));
AMDAIEPackAndTransposeOptions packOptions;
packOptions.packLevel = 0;
funcPassManager.addPass(createAMDAIEPackAndTransposePass(packOptions));
}

// Only promote the result to local memory
// Promote the inputs and results to local memory
{
AMDAIEBufferizeToAllocationOptions bufferizeOptions;
bufferizeOptions.memorySpace = 2;
bufferizeOptions.bufferizeOperand = BufferizeOperand::Output;
bufferizeOptions.bufferizeOperand = BufferizeOperand::InputOutput;
funcPassManager.addPass(
createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
addCleanups();
}

// Tile the reduction dimension using scf.for
{
AMDAIETileAndFuseOptions tileFuseOptions;
tileFuseOptions.tilingLevel = 2;
tileFuseOptions.useSCFFor = true;
funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
}
funcPassManager.addPass(createAMDAIECleanupPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

// Pad the linalg operation
{
AMDAIEPadOptions padOptions;
padOptions.paddingLevel = 2;
funcPassManager.addPass(createAMDAIEPadPass(padOptions));
}

// Promote the inputs to local memory
{
AMDAIEBufferizeToAllocationOptions bufferizeOptions;
bufferizeOptions.memorySpace = 2;
bufferizeOptions.bufferizeOperand = BufferizeOperand::Input;
funcPassManager.addPass(
createAMDAIEBufferizeToAllocationPass(bufferizeOptions));
addCleanups();
}

// Decompose Conv2d ops to Conv1d ops
funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass());
LinalgFoldUnitExtentDimsPassOptions opts;
opts.useRankReducingSlices = true;
funcPassManager.addPass(mlir::createLinalgFoldUnitExtentDimsPass(opts));

// Vectorization passes
// FIXME(newling) https://github.com/nod-ai/iree-amd-aie/issues/820
enableVectorizationPasses = false;
appendVectorizationToPipeline(funcPassManager, enableVectorizationPasses);
funcPassManager.addPass(createCanonicalizerPass());

// Comprehensive bufferization
addAMDAIEBufferizePasses(funcPassManager, useTilePipeline);
funcPassManager.addPass(createHoistStaticallyBoundAllocationsPass());
}

void buildAMDAIETransformPassPipeline(
Expand Down Expand Up @@ -557,6 +544,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
bool enablePacketFlow) {
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
passManager.addPass(memref::createFoldMemRefAliasOpsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createAMDAIEConvertToDmaPass());

passManager.addPass(createAMDAIENormalizeLoopBoundsPass());
@@ -582,6 +570,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
passManager.addPass(createAMDAIEAssignLogicalObjectFifoDepthPass());
passManager.addPass(createAMDAIEAccessToAcquireReleasePass());
passManager.addPass(createAMDAIENoneAccessToTemporaryBufferPass());

passManager.addPass(
createAMDAIEAssignConnectionTypesPass({enablePacketFlow}));
passManager.addPass(createCSEPass());
@@ -612,6 +601,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
passManager.addPass(createCanonicalizerPass());

passManager.addPass(createAMDAIEObjFifoBufferizationPass());
passManager.addPass(createAMDAIETemporaryAllocBufferizationPass());
passManager.addPass(createAMDAIEConnectionToFlowPass());
passManager.addPass(createAMDAIEAssignPacketIdsPass());

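The main structural change in addConvDecomposePassPipeline above is factoring the repeated cleanup passes (AMDAIECleanup, canonicalizer, CSE) into an addCleanups lambda that is invoked after each tiling and bufferization step instead of being written out inline. A toy standalone sketch of that capture-by-reference pattern, using a plain string list as a stand-in for the pass manager; the labels are descriptive placeholders, not real pass flags.

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Stand-in for the function pass manager: just an ordered list of labels.
  std::vector<std::string> pipeline;
  auto addPass = [&](const std::string &name) { pipeline.push_back(name); };

  // Mirrors the addCleanups lambda in addConvDecomposePassPipeline: the same
  // three passes are appended after each tiling level without repetition.
  auto addCleanups = [&]() {
    addPass("amdaie-cleanup");
    addPass("canonicalize");
    addPass("cse");
  };

  addPass("tile-and-fuse (level 0)");
  addCleanups();
  addPass("tile-and-fuse (level 1)");
  addCleanups();

  for (const auto &p : pipeline) std::cout << p << "\n";
}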