From 19ea8115499a15413911b7426eab959be144e408 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Thu, 8 Aug 2024 20:32:34 +0000 Subject: [PATCH] [ObjectFifo] Create a new pass to split L2 buffers -- This commit introduces a new pass `--iree-amdaie-split-buffers` to split L2 buffers for dealing with Matmul+Elementwise. -- It addresses sub-action 2 as well from https://github.com/nod-ai/iree-amd-aie/issues/644 Signed-off-by: Abhishek Varma --- .../Transforms/AMDAIESplitBuffers.cpp | 156 ++++++++++++++++++ .../iree-amd-aie/Transforms/CMakeLists.txt | 1 + .../iree-amd-aie/Transforms/PassDetail.h | 3 +- .../iree-amd-aie/Transforms/Passes.cpp | 1 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 3 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 6 + .../Transforms/test/CMakeLists.txt | 1 + .../Transforms/test/split_buffers.mlir | 102 ++++++++++++ 8 files changed, 272 insertions(+), 1 deletion(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp new file mode 100644 index 000000000..0da2b1851 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitBuffers.cpp @@ -0,0 +1,156 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "aie/Dialect/AIE/IR/AIEDialect.h"
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "iree-amd-aie/Transforms/Transforms.h"
+#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
+
+#define DEBUG_TYPE "iree-amdaie-split-buffers"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// 1-based position, within a core, of the `logicalobjectfifo.consume` op
+/// whose source buffer is split. The value is a WIP heuristic: in the
+/// matmul+elementwise case this pass targets, the first two consumes feed the
+/// matmul and the third one feeds the elementwise op.
+constexpr unsigned kCandidateConsumeIndex = 3;
+
+/// Memory space of the buffers this pass splits (L2 in the accompanying test).
+constexpr uint64_t kL2MemorySpace = 1;
+
+/// Module pass that gives the candidate consume of every core its own,
+/// narrowed L2 buffer instead of a slice of a buffer shared between cores.
+class AMDAIESplitBuffersPass
+    : public impl::AMDAIESplitBuffersBase<AMDAIESplitBuffersPass> {
+ public:
+  using AMDAIESplitBuffersBase::AMDAIESplitBuffersBase;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+  void runOnOperation() override;
+};
+
+/// For every `amdaie.core`, takes its `kCandidateConsumeIndex`-th consume op
+/// and, when the DMA it consumes reads from an L2 `memref.alloc`:
+///   1. allocates a new flat L2 buffer holding as many elements as the DMA's
+///      target buffer,
+///   2. wraps it in a new `logicalobjectfifo.from_memref` on the same tiles,
+///   3. re-creates the DMA that fills the old L2 buffer so that it fills the
+///      new one, and rewrites the consumed DMA to read from the new buffer,
+///   4. schedules the old alloc, objectfifo and producer DMA for erasure.
+/// Candidates that do not match this shape are left untouched.
+void AMDAIESplitBuffersPass::runOnOperation() {
+  ModuleOp moduleOp = getOperation();
+  IRRewriter rewriter(moduleOp.getContext());
+
+  // Collect the candidate consume op of every core up front so the IR is not
+  // mutated while it is being walked.
+  SmallVector<AMDAIE::LogicalObjectFifoConsume> consumeOps;
+  moduleOp.walk([&](AMDAIE::CoreOp coreOp) {
+    AMDAIE::LogicalObjectFifoConsume candidateConsumeOp = nullptr;
+    unsigned consumeOpCount = 0;
+    coreOp.walk([&](AMDAIE::LogicalObjectFifoConsume consumeOp) {
+      if (++consumeOpCount == kCandidateConsumeIndex) {
+        candidateConsumeOp = consumeOp;
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+    if (!candidateConsumeOp) return WalkResult::skip();
+    consumeOps.push_back(candidateConsumeOp);
+    return WalkResult::advance();
+  });
+
+  // Ops shared between several candidates are erased once, after all
+  // candidates have been rewritten.
+  DenseSet<Operation *> toBeErased;
+  for (AMDAIE::LogicalObjectFifoConsume candidateConsumeOp : consumeOps) {
+    AMDAIE::DmaCpyNdOp l2ToL1DmaOp = candidateConsumeOp.getDmaCpyNdOp();
+    if (!l2ToL1DmaOp) continue;
+
+    // The source must be an objectfifo over an L2 `memref.alloc`.
+    LogicalObjectFifoFromMemrefOp sourceObjectFifo =
+        l2ToL1DmaOp.getSourceObjectFifo();
+    if (!sourceObjectFifo) continue;
+    auto sourceAllocOp =
+        sourceObjectFifo.getMemref().getDefiningOp<memref::AllocOp>();
+    if (!sourceAllocOp ||
+        sourceObjectFifo.getMemorySpaceAsUInt() != kL2MemorySpace)
+      continue;
+
+    // The target must be backed by a `memref.alloc` as well: its type
+    // determines the size of the narrowed buffer.
+    LogicalObjectFifoFromMemrefOp targetObjectFifo =
+        l2ToL1DmaOp.getTargetObjectFifo();
+    if (!targetObjectFifo) continue;
+    auto targetAllocOp =
+        targetObjectFifo.getMemref().getDefiningOp<memref::AllocOp>();
+    if (!targetAllocOp) continue;
+
+    // Although we have the DmaCpyNd user above, the
+    // logicalobjectfifo.from_memref is used in other DmaCpyNds as well for
+    // other core ops. Find the one writing into the L2 buffer. This is done
+    // before any IR is created so that an unsupported candidate leaves no
+    // half-built ops behind. Users that are not DmaCpyNd ops are skipped.
+    AMDAIE::DmaCpyNdOp l3ToL2DmaOp;
+    for (Operation *objFifoUserOp : sourceObjectFifo->getUsers()) {
+      auto dmaOp = dyn_cast<AMDAIE::DmaCpyNdOp>(objFifoUserOp);
+      if (dmaOp && dmaOp.getTargetObjectFifo() == sourceObjectFifo) {
+        l3ToL2DmaOp = dmaOp;
+        break;
+      }
+    }
+    if (!l3ToL2DmaOp) continue;
+
+    // Look up the dealloc of the old buffer while its use list is intact.
+    memref::DeallocOp oldDeallocOp;
+    for (Operation *userOp : sourceAllocOp->getUsers()) {
+      if (auto deallocUser = dyn_cast<memref::DeallocOp>(userOp)) {
+        oldDeallocOp = deallocUser;
+      }
+    }
+
+    // Now we'll create a narrowed buffer: flat, with the element count and
+    // element type of the target and the memory space of the old source.
+    rewriter.setInsertionPoint(sourceAllocOp);
+    auto oldSourceMemRefType = cast<MemRefType>(sourceAllocOp.getType());
+    auto targetMemRefType = cast<MemRefType>(targetAllocOp.getType());
+    MemRefType newAllocType = MemRefType::get(
+        targetMemRefType.getNumElements(), targetMemRefType.getElementType(),
+        MemRefLayoutAttrInterface{}, oldSourceMemRefType.getMemorySpace());
+    auto newAllocOp =
+        rewriter.create<memref::AllocOp>(sourceAllocOp.getLoc(), newAllocType);
+    auto newDeallocOp =
+        rewriter.create<memref::DeallocOp>(sourceAllocOp.getLoc(), newAllocOp);
+    // Keep the new buffer alive until the end of the enclosing block.
+    newDeallocOp->moveBefore(&newAllocOp->getBlock()->back());
+
+    rewriter.setInsertionPoint(sourceObjectFifo);
+    auto source = rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+        sourceObjectFifo.getLoc(), LogicalObjectFifoType::get(newAllocType),
+        newAllocOp.getResult(), sourceObjectFifo.getTiles());
+
+    // Producer side: same access pattern, new L2 destination.
+    rewriter.setInsertionPoint(l3ToL2DmaOp);
+    rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l3ToL2DmaOp.getLoc(), source, l3ToL2DmaOp.getTargetMixedOffsets(),
+        l3ToL2DmaOp.getTargetMixedSizes(), l3ToL2DmaOp.getTargetMixedStrides(),
+        l3ToL2DmaOp.getSource(), l3ToL2DmaOp.getSourceMixedOffsets(),
+        l3ToL2DmaOp.getSourceMixedSizes(), l3ToL2DmaOp.getSourceMixedStrides());
+
+    // Consumer side: same access pattern, new L2 source.
+    rewriter.setInsertionPoint(l2ToL1DmaOp);
+    auto newL2ToL1DmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
+        l2ToL1DmaOp.getLoc(), l2ToL1DmaOp.getTarget(),
+        l2ToL1DmaOp.getTargetMixedOffsets(), l2ToL1DmaOp.getTargetMixedSizes(),
+        l2ToL1DmaOp.getTargetMixedStrides(), source,
+        l2ToL1DmaOp.getSourceMixedOffsets(), l2ToL1DmaOp.getSourceMixedSizes(),
+        l2ToL1DmaOp.getSourceMixedStrides());
+    rewriter.replaceOp(l2ToL1DmaOp, newL2ToL1DmaOp);
+    // We have to discard non-zero offsets as subview has been replaced by a
+    // dedicated allocated memref.
+    SmallVector<int64_t> allocShape(newAllocType.getShape());
+    (void)discardAllNonZeroOffsets<CopyOpOperateOn::Source>(
+        rewriter,
+        cast<AMDAIE::DoublyStridedOpInterface>(newL2ToL1DmaOp.getOperation()),
+        allocShape);
+
+    // Remove old dealloc.
+    if (oldDeallocOp) {
+      rewriter.eraseOp(oldDeallocOp);
+    }
+
+    toBeErased.insert(l3ToL2DmaOp);
+    toBeErased.insert(sourceAllocOp);
+    toBeErased.insert(sourceObjectFifo);
+  }
+
+  // NOTE(review): uses are dropped unconditionally, so any user of the old
+  // buffer/objectfifo/DMA that was not rewritten above is left with a null
+  // operand. Confirm every user is covered before this pass leaves WIP.
+  for (Operation *op : toBeErased) {
+    op->dropAllUses();
+    rewriter.eraseOp(op);
+  }
+}
+
+}  // namespace
+
+/// Creates an instance of the `iree-amdaie-split-buffers` pass.
+std::unique_ptr<Pass> createAMDAIESplitBuffersPass() {
+  return std::make_unique<AMDAIESplitBuffersPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 488082cbb..c6edd665b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -82,6 +82,7 @@ iree_cc_library(
     "AMDAIEPad.cpp"
     "AMDAIEPeelForLoop.cpp"
     "AMDAIEPropagateDataLayout.cpp"
+    "AMDAIESplitBuffers.cpp"
     "AMDAIETile.cpp"
     "AMDAIETileAndFuse.cpp"
     "AMDAIEUtils.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 5a55e6fa2..5be6ddf82 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -60,11 +60,12 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEPACKANDTRANSPOSE
 #define GEN_PASS_DEF_AMDAIEPACKTODMA
 #define GEN_PASS_DEF_AMDAIEPAD
-#define GEN_PASS_DEF_AMDAIEVECTORIZATION
 #define GEN_PASS_DEF_AMDAIEPEELFORLOOP
 #define GEN_PASS_DEF_AMDAIEPROPAGATEDATALAYOUT
+#define GEN_PASS_DEF_AMDAIESPLITBUFFERS
 #define GEN_PASS_DEF_AMDAIETILE
 #define GEN_PASS_DEF_AMDAIETILEANDFUSE
+#define GEN_PASS_DEF_AMDAIEVECTORIZATION
 #include "iree-amd-aie/Transforms/Passes.h.inc"
 
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 3595cfc72..d61b8a2ba 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -578,6 +578,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createAMDAIEDistributeCoresAndObjectFifosPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIESplitBuffersPass()); passManager.addPass(createAMDAIEDmaToCircularDmaPass()); passManager.addNestedPass(createAMDAIECreateAIEWorkgroupPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index b2507d405..b2b419f05 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -197,6 +197,9 @@ std::unique_ptr createAMDAIEPadPass(AMDAIEPadOptions options = {}); std::unique_ptr createAMDAIEPeelForLoopPass( AMDAIEPeelForLoopOptions options = {}); +/// Create a pass to split buffers. +std::unique_ptr createAMDAIESplitBuffersPass(); + /// Create pass to tile TilingInterface operations. 
std::unique_ptr createAMDAIETilePass(AMDAIETileOptions options = {}); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 68086dab4..e54670612 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -402,6 +402,12 @@ def AMDAIEPropagateDataLayout : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEPropagateDataLayoutPass()"; } +def AMDAIESplitBuffers : + Pass<"iree-amdaie-split-buffers", "ModuleOp"> { + let summary = "Split buffers."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitBuffersPass()"; +} + def AMDAIETile : InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> { let summary = "Pass to tile TilingInterface operations."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index a55888807..4909174e6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -54,6 +54,7 @@ iree_lit_test_suite( "pad.mlir" "peel_for_loop.mlir" "propagate_data_layout.mlir" + "split_buffers.mlir" "tile_and_fuse_using_scf_for.mlir" "tile_and_fuse_using_scf_forall.mlir" "tile_copy_using_scf_for.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir new file mode 100644 index 000000000..62b48e288 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_buffers.mlir @@ -0,0 +1,102 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-buffers,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: @split_l2_buffer +// CHECK-DAG: 
%[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> +// CHECK-DAG: %[[L2_ALLOC:.*]] = memref.alloc() : memref<1024xi32, 1 : i32> +// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> +// CHECK: %[[TILE:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK: %[[L2_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC]], {%[[TILE]]} : +// CHECK-SAME: memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE]]} : +// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> +// CHECK: scf.forall +// CHECK: %[[DMA_CPY_ND_L3_TO_L2:.*]] = amdaie.dma_cpy_nd(%[[L2_OBJECTFIFO]] +// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.logicalobjectfifo.from_memref +// CHECK: amdaie.dma_cpy_nd +// CHECK: amdaie.dma_cpy_nd +// CHECK: %[[L1_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]] +// CHECK: %[[DMA_CPY_ND_L2_TO_L1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO]] +// CHECK-SAME: %[[L2_OBJECTFIFO]] +// CHECK: amdaie.core(%[[TILE]]) { +// CHECK: amdaie.logicalobjectfifo.consume +// CHECK: amdaie.logicalobjectfifo.consume +// CHECK: linalg.generic +// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_CPY_ND_L2_TO_L1]]) +// CHECK: } +// CHECK: memref.dealloc %[[L2_ALLOC]] : memref<1024xi32, 1 : i32> +#map = affine_map<(d0) -> (d0 * 64)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> +module { + func.func @split_l2_buffer(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: 
!amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c3 = arith.constant 3 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %c2048 = arith.constant 2048 : index + %c256 = arith.constant 256 : index + %c1024 = arith.constant 1024 : index + %c4096 = arith.constant 4096 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> + %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> + %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> + %alloc_2 = memref.alloc() : memref<128x128xi32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %tile = amdaie.tile(%c1, %c3) + %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg4, %arg5) in (2, 2) { + %2 = affine.apply #map(%arg5) + %3 = affine.apply #map(%arg4) + %4 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c2, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %3, %2] [%c2, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_4 = amdaie.tile(%c1, %c3) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%arg0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %5[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, 
!amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%arg1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %6[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.dma_cpy_nd(%9[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c1024, %c1024, %c128, %c16, %c4, %c1], %0[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c8, %c4, %c4] [%c2048, %c1024, %c4, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%arg3[%c1, %c1, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %arg2[%c0, %c0, %c0, %c0] [%c8, %c4, %c8, %c4] [%c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.core(%tile_4) { + amdaie.logicalobjectfifo.consume(%7) + amdaie.logicalobjectfifo.consume(%8) + %13 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %14 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %15 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%13, %14 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%15 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_5: i32, %out: i32): + %18 = arith.muli %in, %in_5 : i32 + %19 = arith.addi %out, %18 : i32 + linalg.yield %19 : i32 + } + amdaie.logicalobjectfifo.consume(%10) + %16 = amdaie.logicalobjectfifo.access(%arg2, Read) : 
!amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %17 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%15, %16 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%17 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_5: i32, %out: i32): + %18 = arith.addi %in, %in_5 : i32 + linalg.yield %18 : i32 + } + amdaie.logicalobjectfifo.produce(%11) + amdaie.end + } + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> + memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<128x128xi32> + return + } +}