Skip to content

Commit

Permalink
[DistributeCoresAndObjectFifos] Fix for case of mixed operands (subviews and allocs). (#811)
Browse files Browse the repository at this point in the history

Fix for a regression in depthwise convolution with packing, when we use
the upstream pass to remove unit extent dimensions
`useRankReducingSlices = true`. Enabling the unit extent dimension
removal results in an op:


```
linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%alloc : memref<4x4xi32, 2 : i32>) outs(%subview : memref<4x4xi32, strided<[4, 1]>, 2 : i32>) { ... }
```

appearing in the IR, which is new because it's the first time we see an
op with one operand directly from a `memref.alloc`, and another from a
`memref.subview` operation. This PR adds a fix for this case by
simplifying the logic: instead of first checking whether any operand of the
op comes from a `memref.alloc` and then processing all operands, each
operand is now checked individually. Before this PR, the logic implicitly
assumed that either all or none of an op's operands came from
`memref.alloc`.

---------

Co-authored-by: Abhishek Varma <abhvarma@amd.com>
  • Loading branch information
newling and Abhishek-Varma authored Oct 2, 2024
1 parent aae7cc6 commit a88304a
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -637,17 +637,6 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) {
memrefToLogicalObjectFifoAccess;

WalkResult res = coreOp->walk([&](Operation *op) {
bool hasAllocOperand = [op]() {
for (Value operand : op->getOperands()) {
if (isa_and_present<memref::AllocOp>(operand.getDefiningOp()))
return true;
}
return false;
}();

if (!hasAllocOperand) {
return WalkResult::advance();
}
// We want to insert amdaie.logicalobjectfifo.access ops right before
// the first usage. But for vectorized ops this would mean they'd get
// inserted within the vectorized scf.for ops. We therefore would want
Expand All @@ -659,6 +648,9 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) {
opToInsertRewriterPoint = opToInsertRewriterPoint->getParentOp();
}
for (auto &&[idx, operand] : llvm::enumerate(op->getOpOperands())) {
Operation *operandDefiningOp = operand.get().getDefiningOp();
if (!dyn_cast_if_present<memref::AllocOp>(operandDefiningOp))
continue;
if (memrefToLogicalObjectFifoAccess.contains(operand.get())) {
op->setOperand(idx, memrefToLogicalObjectFifoAccess[operand.get()]);
} else if (memrefToLogicalObjectFifo.contains(operand.get())) {
Expand All @@ -674,10 +666,12 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) {
llvm::dyn_cast<MemRefType>(operand.get().getType())) {
Value memref = operand.get();
rewriter.setInsertionPoint(coreOp);

auto logicalObjectFifo =
rewriter.create<AMDAIE::LogicalObjectFifoFromMemrefOp>(
rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type),
memref);

rewriter.setInsertionPoint(opToInsertRewriterPoint);

AMDAIE::LogicalObjectFifoAccessOp accessOp;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -680,8 +680,8 @@ func.func @l1_temporary_buffer_for_matmul_elem() {

// -----

// A case where an L1 memory is not distributable. Note: this form arises with a
// pad-based tiling strategy.
// A case where an L1 memory is not distributable. Note: this form arises with a
// pad-based tiling strategy.
// CHECK-LABEL: @not_distributable
// CHECK: memref.alloc() : memref<2x2x100xbf16, 2>
// CHECK: memref.subview
Expand Down Expand Up @@ -1041,3 +1041,45 @@ module {
return
}
}

// -----

// Tests the fix for the case where linalg.generic has a mix of subview and direct alloc operands.
// Before the fix, this resulted in "error: operand #0 does not dominate this use".


// CHECK-LABEL: mixed_alloc_subview_operands
// CHECK: amdaie.core
// CHECK-DAG: %[[ACCESS_0:.*]] = amdaie.logicalobjectfifo.access{{.*}} -> memref<1x1x4x1x4xi32, 2 : i32>
// CHECK-DAG: %[[ACCESS_1:.*]] = amdaie.logicalobjectfifo.access{{.*}} -> memref<4x4xi32, 2 : i32>
// CHECK-DAG: %[[SUBVIEW:.*]] = memref.subview %[[ACCESS_0]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ACCESS_1]] : memref<4x4xi32, 2 : i32>) outs(%[[SUBVIEW:.*]] : memref<4x4xi32, strided<[4, 1]>, 2 : i32>) {

// Identity indexing map for the elementwise copy performed by linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<Custom>
module {
// Regression test: inside the amdaie.core, the linalg.generic mixes an
// operand taken directly from a memref.alloc (%alloc) with an operand
// produced by a memref.subview of another alloc (%subview of %alloc_0).
func.func @mixed_alloc_subview_operands() attributes {translation_info = #translation} {
%c2 = arith.constant 2 : index
%c0_i32 = arith.constant 0 : i32
// Buffer in memory space 2, fed directly (no subview) to linalg.generic.
%alloc = memref.alloc() : memref<4x4xi32, 2 : i32>
// Buffer in memory space 2, accessed via a rank-reducing subview below.
%alloc_0 = memref.alloc() : memref<1x1x4x1x4xi32, 2 : i32>
// Buffer in memory space 1, destination of the DMA copy out of %alloc_0.
%alloc_1 = memref.alloc() : memref<1x1x4x4xi32, 1 : i32>
// Wrap both DMA-visible buffers as logical object FIFOs.
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x4xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x4xi32, 1 : i32>>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x1x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x1x4xi32, 2 : i32>>
scf.forall (%arg0, %arg1, %arg2, %arg3) in (1, 1, 1, 1) {
%2 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 4, 4] [16, 16, 4, 1], %1[0, 0, 0, 0, 0] [1, 1, 4, 1, 4] [16, 16, 4, 4, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x4xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x1x4xi32, 2 : i32>>)
%tile = amdaie.tile(%arg1, %c2)
// The core produces %2; its body is where the mixed-operand op lives.
%3 = amdaie.core(%tile, in : [], out : [%2]) {
linalg.fill ins(%c0_i32 : i32) outs(%alloc_0 : memref<1x1x4x1x4xi32, 2 : i32>)
// Rank-reducing subview: collapses the three unit dims of %alloc_0 to 4x4.
%subview = memref.subview %alloc_0[0, 0, 0, 0, 0] [1, 1, 4, 1, 4] [1, 1, 1, 1, 1] : memref<1x1x4x1x4xi32, 2 : i32> to memref<4x4xi32, strided<[4, 1]>, 2 : i32>
// Mixed operands: ins comes straight from an alloc, outs from a subview.
linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%alloc : memref<4x4xi32, 2 : i32>) outs(%subview : memref<4x4xi32, strided<[4, 1]>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>, #gpu.thread<linear_dim_0>]}
return
}
}

0 comments on commit a88304a

Please sign in to comment.