From 3679e9cfd77edcd96f0174ded28993a6bafa9128 Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com>
Date: Mon, 8 May 2023 11:33:06 -0700
Subject: [PATCH] Pad handling without changing upstream interface. (#13133)

The current default dispatch region formation has options to
- disable splitting pad into fill + tensor.insert_slice
- allow fusion of pad with producer
- allow fusion of pad with consumer.

While none of these are on by default, this PR adds support for handling
these in the CPU backend. The current state is
- The pad by itself in a dispatch gets vectorized.
- Pad fused with consumer gets vectorized too.
- Pad fused with producer does not get vectorized. This requires more
  work and potentially some changes to get the IR into a better state
  w.r.t. destination passing.

There are lit tests that show the handling of the different modes today
within the CPU backend.

To get things working, one thing to handle is that the code generated by
tiling the pad operation is of the form

```
scf.if {
  ...
} else {
  ...
  tensor.pad
}
```

The `if` here accounts for cases where a tile could be reading only the
padding. This does not happen in IREE, so there is a temporary hack here
that just folds the `if` away. Long term, a better solution is needed
(probably requiring rethinking of pad specification and tiling).
---
 .../Common/IREEComprehensiveBufferizePass.cpp |  13 ++
 .../Common/TileDispatchUsingInterface.cpp     |   6 +-
 .../PartitionableLoopsInterface.cpp           |   2 +
 .../Codegen/LLVMCPU/KernelDispatch.cpp        | 111 +++++++--
 .../Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp    |  40 ++++
 .../iree/compiler/Codegen/LLVMCPU/Passes.cpp  |  59 +++--
 .../compiler/Codegen/LLVMCPU/test/BUILD.bazel |   1 +
 .../Codegen/LLVMCPU/test/CMakeLists.txt       |   1 +
 .../LLVMCPU/test/pad_pipeline_tests.mlir      | 220 ++++++++++++++++++
 .../src/iree/compiler/Codegen/Utils/Utils.cpp |  17 +-
 .../iree/compiler/Tools/init_mlir_dialects.h  |   2 +-
 11 files changed, 410 insertions(+), 62 deletions(-)
 create mode 100644 compiler/src/iree/compiler/Codegen/LLVMCPU/test/pad_pipeline_tests.mlir

diff --git a/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
index 0cf1e63ab248..6790319f9626 100644
--- a/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
@@ -175,6 +175,19 @@ LogicalResult eliminateEmptyTensors(
 void EliminateEmptyTensorsPass::runOnOperation() {
   ModuleOp moduleOp = getOperation();
+  MLIRContext *context = &getContext();
+
+  // Run the convert to destination style patterns.
+ { + RewritePatternSet patterns(context); + linalg::populateConvertToDestinationStylePatterns(patterns); + if (failed(applyPatternsAndFoldGreedily(moduleOp, std::move(patterns)))) { + moduleOp->emitOpError( + "Failed in conversion to destination style patterns"); + return signalPassFailure(); + } + } + OneShotBufferizationOptions options = getBufferizationOptions(); IRRewriter rewriter(moduleOp->getContext()); diff --git a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp index ab19bb29df1b..606d0de6728b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp @@ -450,9 +450,9 @@ static SmallVector getAllFusableProducers(TilingInterface op) { Operation *currOp = worklist.front(); worklist.pop_front(); for (OpOperand &operand : currOp->getOpOperands()) { - auto tilingInterfaceProducer = - operand.get().getDefiningOp(); - if (!tilingInterfaceProducer || + Operation *definingOp = operand.get().getDefiningOp(); + auto tilingInterfaceProducer = dyn_cast(definingOp); + if (!tilingInterfaceProducer || isa(definingOp) || producers.count(tilingInterfaceProducer)) { continue; } diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp index adf7618dbdc8..8828c0bc6402 100644 --- a/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp +++ b/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp @@ -239,6 +239,8 @@ void registerPartitionableLoopsInterfaceModels(DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, tensor::TensorDialect *dialect) { tensor::PackOp::attachInterface< OuterParallelAsPartitionableLoops>(*ctx); + tensor::PadOp::attachInterface< + OuterParallelAsPartitionableLoops>(*ctx); tensor::UnPackOp::attachInterface< OuterParallelAsPartitionableLoops>(*ctx); }); diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp index 218f9915bc3b..143a0f7734e9 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp @@ -171,6 +171,22 @@ static llvm::raw_ostream &operator<<( return os; } +/// Splits the given `Range` vector and returns the `lbs` and the `ubs` as +/// separate lists. +static void getBoundsFromRange(ArrayRef loopRange, + SmallVector &lb, + SmallVector &ub) { + auto getStaticValue = [](OpFoldResult ofr) -> int64_t { + Optional intVal = getConstantIntValue(ofr); + if (!intVal) return ShapedType::kDynamic; + return intVal.value(); + }; + lb = llvm::to_vector(llvm::map_range( + loopRange, [&](Range r) { return getStaticValue(r.offset); })); + ub = llvm::to_vector(llvm::map_range( + loopRange, [&](Range r) { return getStaticValue(r.size); })); +} + /// Returns true if all the input and output tensor operands of 'op' are fully /// dynamic. 
static bool isFullyDynamicOp(linalg::LinalgOp op) { @@ -1751,6 +1767,46 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn, return setConvRootConfig(entryPointFn, convOp, targetTileSizes, vectorSize); } +static LogicalResult setRootConfig(func::FuncOp entryPointFn, + tensor::PadOp padOp) { + OpBuilder builder(padOp.getContext()); + builder.setInsertionPoint(padOp); + SmallVector iterationDomain = + cast(padOp.getOperation()).getIterationDomain(builder); + SmallVector lbs, ubs; + getBoundsFromRange(iterationDomain, lbs, ubs); + + SmallVector minTileSizes(lbs.size(), 1); + SmallVector maxTileSizes(ubs.size(), defaultWorkgroupTileSize); + SmallVector vectorTileSizes(lbs.size(), 1); + + unsigned typeWidthInBytes = IREE::Util::getRoundedElementByteWidth( + padOp.getResultType().getElementType()); + int64_t typeVectorSize = getVectorSize(entryPointFn, typeWidthInBytes); + vectorTileSizes.back() = (ubs.back() == ShapedType::kDynamic + ? 1 + : std::min(typeVectorSize, ubs.back())); + minTileSizes.back() = vectorTileSizes.back(); + + SmallVector partitionableLoops = + cast(padOp.getOperation()) + .getPartitionableLoops(kNumMaxParallelDims); + SmallVector distributedTileSizes = + getDefaultDistributedLevelTileSizes(partitionableLoops, lbs, ubs, + minTileSizes, maxTileSizes); + TileSizesListType tileSizes; + // Distribution tiling + tileSizes.emplace_back(std::move(distributedTileSizes)); + // Tiling for vectorization. + tileSizes.emplace_back(std::move(vectorTileSizes)); + // No further tiling. + tileSizes.push_back({}); + + return setOpConfigAndEntryPointFnTranslation( + entryPointFn, padOp, tileSizes, + DispatchLoweringPassPipeline::CPUDoubleTilingExpert); +} + /// Set default configuration for Linalg ops. static LogicalResult setRootConfig( func::FuncOp entryPointFn, linalg::LinalgOp linalgOp, @@ -1812,12 +1868,13 @@ static LogicalResult setRootConfigImpl( return setRootConfig(entryPointFn, op, LinalgOpInfo(op), targetMLTransInfo); }) - .Case( + .Case( [&](auto op) { return setRootConfig(entryPointFn, op); }) .Case( [&](auto op) { return setUnPackOpRootConfig(entryPointFn, op); }) @@ -1867,21 +1924,47 @@ static LogicalResult setVMVXRootConfigImpl(func::FuncOp entryPointFn, /// to the end of the function is the root op. static FailureOr getRootOperation( ArrayRef computeOps) { + Operation *rootOperation = nullptr; for (auto op : llvm::reverse(computeOps)) { - auto linalgOp = dyn_cast(op); - if (!linalgOp) continue; - if (linalgOp.getNumReductionLoops()) return op; + if (auto linalgOp = dyn_cast(op)) { + // Do not treat linalg ops that are all parallel as root operations in + // this sweep. + if (linalgOp.getNumLoops() == linalgOp.getNumParallelLoops()) continue; + + // All other linalg ops are root ops. + rootOperation = op; + break; + } + + if (isa(op) && + !isa(op)) { + // All other operations that implement this interface are root ops. + rootOperation = op; + break; + } } - for (auto op : llvm::reverse(computeOps)) { - if (isa(op)) return op; + if (!rootOperation) { + // Check for elementwise operations. + for (auto op : llvm::reverse(computeOps)) { + if (isa(op)) { + rootOperation = op; + break; + } + } } - for (auto op : llvm::reverse(computeOps)) { - if (isa(op)) return op; + if (!rootOperation) { + // Check for pad/pack/unpack ops by themselves. 
+ for (auto op : llvm::reverse(computeOps)) { + if (isa(op)) { + rootOperation = op; + break; + } + } } - return nullptr; + return rootOperation; } static LogicalResult adjustTileSizesForPackOp(func::FuncOp entryPointFn, diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp index e4e40c0f36aa..756df4c12bc6 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp @@ -45,6 +45,35 @@ static void collectTiledAndFusedOps(Operation *rootOp, } } +/// Tiling of `tensor.pad` operation generates +/// +/// ```mlir +/// scf.if { +/// ... +/// } else { +/// tensor.pad +/// } +/// ``` +/// +/// For IREEs use case we dont need this. So this folds away the `if` condition. +/// Note this is a fairly hacky workaround, but the current pad operation +/// semantics force us down this path. +static FailureOr foldIfGeneratedFromPadding( + RewriterBase &rewriter, tensor::PadOp untiledPadOp, + tensor::PadOp tiledPadOp) { + auto ifOp = dyn_cast(tiledPadOp->getParentOp()); + if (!ifOp) { + return failure(); + }; + Block *block = tiledPadOp->getBlock(); + Operation *terminator = block->getTerminator(); + ValueRange results = terminator->getOperands(); + rewriter.inlineBlockBefore(block, ifOp, /*blockArgs=*/{}); + rewriter.replaceOp(ifOp, results); + rewriter.eraseOp(terminator); + return tiledPadOp; +} + /// This pass starts with the last TilingInterface operation, tiles the op and /// fuses its producers recursively. The `tilingLevel` must be specified. It /// picks the `tilingLevel`-th list as tiling sizes from lowering_config. @@ -83,6 +112,17 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp, } yieldedValuesToOrigValues.append(rootOp->result_begin(), rootOp->result_end()); + + // WAR for `if` ops generating `scf.if` operations. + if (auto rootPadOp = dyn_cast(rootOp)) { + assert(tilingResult->tiledOps.size() == 1 && + "expected tiling of `pad` op to return only one operation"); + FailureOr replacementTiledOp = foldIfGeneratedFromPadding( + rewriter, rootPadOp, cast(tilingResult->tiledOps[0])); + if (!failed(replacementTiledOp)) { + tilingResult->tiledOps[0] = replacementTiledOp.value(); + } + } tiledOps.append(tilingResult->tiledOps); // 2. Tiling each operation results in generation of slices. 
The source of diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp index a19f23bcc9db..d67abaabfc62 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp @@ -125,8 +125,7 @@ static void addBufferizePasses(OpPassManager &passManager) { createEraseHALDescriptorTypeFromMemRefPass()); } -static void addTileAndDistributePasses( - OpPassManager &pm, bool useFuseTensorPadWithConsumerPass = true) { +static void addTileAndDistributePasses(OpPassManager &pm) { pm.addPass(createTileAndDistributeToWorkgroupsPass()); auto &nestedModulePM = pm.nest(); nestedModulePM.addNestedPass( @@ -135,10 +134,10 @@ static void addTileAndDistributePasses( createFoldAffineMinInDistributedLoopsPass()); nestedModulePM.addPass(createCanonicalizerPass()); nestedModulePM.addPass(createCSEPass()); - if (clEnablePadConsumerFusion && useFuseTensorPadWithConsumerPass) { - nestedModulePM.addNestedPass( - createFuseTensorPadWithConsumerPass()); - } + nestedModulePM.addNestedPass( + createFuseTensorPadWithConsumerPass()); + nestedModulePM.addNestedPass( + createConcretizePadResultShapePass()); nestedModulePM.addNestedPass( IREE::LinalgExt::createTileAndDecomposeAttentionPass()); nestedModulePM.addNestedPass( @@ -353,8 +352,7 @@ void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager, void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager, bool enableVectorMasking) { - addTileAndDistributePasses(passManager, - /*useFuseTensorPadWithConsumerPass=*/false); + addTileAndDistributePasses(passManager); OpPassManager &nestedModulePM = passManager.nest(); nestedModulePM.addNestedPass(createLLVMCPUTileAndFusePass( @@ -394,8 +392,7 @@ void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager, void addVMVXDefaultPassPipeline(OpPassManager &passManager, bool enableMicrokernels) { - addTileAndDistributePasses(passManager, - /*useFuseTensorPadWithConsumerPass=*/false); + addTileAndDistributePasses(passManager); if (enableMicrokernels) { passManager.nest().addPass(createLLVMCPULowerToUKernelsPass()); @@ -441,6 +438,10 @@ void addMultiTilingExpertPassPipeline(OpPassManager &passManager, for (int64_t i = 1; i < numLevels - 1; ++i) { nestedModulePM.addNestedPass(createLLVMCPUTileAndFusePass(i)); + nestedModulePM.addNestedPass( + createFuseTensorPadWithConsumerPass()); + nestedModulePM.addNestedPass( + createConcretizePadResultShapePass()); } // Run SplitReductionPass before the final reduction Fuse pass, because // SplitReductionPass takes care of banked-tiling. 
@@ -449,13 +450,10 @@ void addMultiTilingExpertPassPipeline(OpPassManager &passManager, nestedModulePM.addNestedPass( createLLVMCPUTilePass(numLevels - 1)); - if (clEnablePadConsumerFusion) { - nestedModulePM.addNestedPass( - createFuseTensorPadWithConsumerPass()); - nestedModulePM.addNestedPass( - createConcretizePadResultShapePass()); - nestedModulePM.addNestedPass(createVectorizePadPass()); - } + nestedModulePM.addNestedPass( + createFuseTensorPadWithConsumerPass()); + nestedModulePM.addNestedPass( + createConcretizePadResultShapePass()); if (enablePeeling) { nestedModulePM.addNestedPass(createLLVMCPUPeelPass()); @@ -466,6 +464,8 @@ void addMultiTilingExpertPassPipeline(OpPassManager &passManager, createDecomposePackUnPackOpsPass()); nestedModulePM.addNestedPass(createCanonicalizerPass()); nestedModulePM.addNestedPass(createCSEPass()); + + nestedModulePM.addNestedPass(createVectorizePadPass()); LLVMCPUVectorizationPassOptions options; options.enableVectorMasking = enableVectorMasking; // TODO(#13036): Re-enable once debugged. @@ -503,26 +503,23 @@ void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager, nestedModulePM.addNestedPass(createLLVMCPUTileAndFusePass( static_cast(TilingLevel::ParallelTiles))); - if (clEnablePadConsumerFusion) { - nestedModulePM.addNestedPass( - createFuseTensorPadWithConsumerPass()); - nestedModulePM.addNestedPass( - createConcretizePadResultShapePass()); - } + nestedModulePM.addNestedPass( + createFuseTensorPadWithConsumerPass()); + nestedModulePM.addNestedPass( + createConcretizePadResultShapePass()); + nestedModulePM.addNestedPass( createLLVMCPUTilePass(static_cast(TilingLevel::ReductionTiles))); nestedModulePM.addNestedPass( createDecomposeConvolutionToLowerDimOpsPass()); - if (clEnablePadConsumerFusion) { - nestedModulePM.addNestedPass( - createFuseTensorPadWithConsumerPass()); - nestedModulePM.addNestedPass( - createConcretizePadResultShapePass()); - nestedModulePM.addNestedPass(createVectorizePadPass()); - } + nestedModulePM.addNestedPass( + createFuseTensorPadWithConsumerPass()); + nestedModulePM.addNestedPass( + createConcretizePadResultShapePass()); { + nestedModulePM.addNestedPass(createVectorizePadPass()); LLVMCPUVectorizationPassOptions options; options.enableVectorMasking = enableVectorMasking; options.vectorizePadding = true; @@ -537,7 +534,7 @@ void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager, nestedModulePM.addNestedPass(createCSEPass()); nestedModulePM.addNestedPass(createCanonicalizerPass()); nestedModulePM.addNestedPass( - createOptimizeVectorTransferPass(/*flatten=*/false)); + createOptimizeVectorTransferPass(/*flatten=*/true)); addBufferizePasses(nestedModulePM); // Run IREE specific passes before vector lowering expert. 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel index 0ef2e57d73fe..87a2f94226fd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel @@ -42,6 +42,7 @@ iree_lit_test_suite( "materialize_vmvx_launch_configuration.mlir", "materialize_x86_64_launch_configuration.mlir", "pad_conv_pipeline_tests.mlir", + "pad_pipeline_tests.mlir", "peel.mlir", "peel_and_vectorize.mlir", "pipeline_tests.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt index f3476c3ee330..4ec3d6e52ff8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt @@ -37,6 +37,7 @@ iree_lit_test_suite( "materialize_vmvx_launch_configuration.mlir" "materialize_x86_64_launch_configuration.mlir" "pad_conv_pipeline_tests.mlir" + "pad_pipeline_tests.mlir" "peel.mlir" "peel_and_vectorize.mlir" "pipeline_tests.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pad_pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pad_pipeline_tests.mlir new file mode 100644 index 000000000000..3a90248e3b76 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pad_pipeline_tests.mlir @@ -0,0 +1,220 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))" --split-input-file %s | FileCheck %s + +hal.executable private @pad_only { + hal.executable.variant public @embedded_elf_x86_64, target = <"llvm-cpu", "embedded-elf-x86_64", { + cpu = "generic", cpu_features = "", + data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", + native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> { + hal.executable.export public @pad_only_dispatch ordinal(0) + layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @pad_only_dispatch() { + %c634816 = arith.constant 634816 : index + %c3846080 = arith.constant 3846080 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c634816) flags(ReadOnly) + : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3846080) + : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 64], strides = [1, 1, 1, 1] + : !flow.dispatch.tensor> -> tensor<1x112x112x64xf32> + %padded = tensor.pad %2 low[0, 1, 1, 0] high[0, 1, 1, 0] { + ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index): + tensor.yield %cst : f32 + } : tensor<1x112x112x64xf32> to tensor<1x114x114x64xf32> + flow.dispatch.tensor.store %padded, %1, offsets = [0, 0, 0, 0], sizes = [1, 114, 114, 64], strides = [1, 1, 1, 1] + : tensor<1x114x114x64xf32> -> !flow.dispatch.tensor> + return + } + } + } +} +// CHECK-LABEL: func @pad_only_dispatch() +// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x112x112x64xf32 +// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan 
{{.+}} : memref<1x114x114x64xf32 +// CHECK: scf.for +// CHECK: scf.for +// CHECK: scf.for +// CHECK: scf.if +// CHECK: %[[OUTPUT_SUBVIEW_IF:.+]] = memref.subview %[[OUTPUT]] +// CHECK: linalg.generic +// CHECK-SAME: outs(%[[OUTPUT_SUBVIEW_IF]] +// CHECK: else +// CHECK: %[[INPUT_SUBVIEW:.+]] = memref.subview %[[INPUT]] +// CHECK: %[[OUTPUT_SUBVIEW:.+]] = memref.subview %[[OUTPUT]] +// CHECK: scf.for +// CHECK: scf.for +// CHECK: scf.for +// CHECK: %[[OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SUBVIEW]] +// CHECK: %[[RESULT_VEC:.+]] = scf.if %{{.+}} -> (vector<4xf32>) { +// CHECK: %[[VEC_LOAD:.+]] = vector.load %[[INPUT_SUBVIEW]] +// CHECK: scf.yield %[[VEC_LOAD]] +// CHECK: } +// CHECK: vector.store %[[RESULT_VEC]], %[[OUTPUT_SLICE]] + +// ----- + +hal.executable private @pad_with_producer { + hal.executable.variant public @embedded_elf_x86_64, target = <"llvm-cpu", "embedded-elf-x86_64", { + cpu = "generic", cpu_features = "", + data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", + native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> { + hal.executable.export public @pad_with_producer_dispatch ordinal(0) + layout(#hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @pad_with_producer_dispatch() { + %c802816 = arith.constant 802816 : index + %c72545728 = arith.constant 72545728 : index + %c72676800 = arith.constant 72676800 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 1.001000e-05 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c802816) flags(ReadOnly) + : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c72545728) flags(ReadOnly) + : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c72676800) flags(ReadOnly) + : !flow.dispatch.tensor> + %7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + : !flow.dispatch.tensor> + %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 56, 56, 256], strides = [1, 1, 1, 1] + : !flow.dispatch.tensor> -> tensor<1x56x56x256xf32> + %9 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 256, 128], strides = [1, 1, 1, 1] + : !flow.dispatch.tensor> -> tensor<1x1x256x128xf32> + %10 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor> -> tensor<128xf32> + %15 = tensor.empty() : tensor<1x28x28x128xf32> + %16 = linalg.fill ins(%cst_0 : f32) outs(%15 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32> + %17 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} + ins(%8, %9 : tensor<1x56x56x256xf32>, tensor<1x1x256x128xf32>) outs(%16 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32> + %18 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel"]} + ins(%17, %10 : tensor<1x28x28x128xf32>, 
tensor<128xf32>) outs(%15 : tensor<1x28x28x128xf32>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %20 = arith.addf %in, %in_1 : f32 + linalg.yield %20 : f32 + } -> tensor<1x28x28x128xf32> + %padded = tensor.pad %18 low[0, 1, 1, 0] high[0, 1, 1, 0] { + ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index): + tensor.yield %cst_0 : f32 + } : tensor<1x28x28x128xf32> to tensor<1x30x30x128xf32> + flow.dispatch.tensor.store %padded, %7, offsets = [0, 0, 0, 0], sizes = [1, 30, 30, 128], strides = [1, 1, 1, 1] + : tensor<1x30x30x128xf32> -> !flow.dispatch.tensor> + return + } + } + } +} +// CHECK-LABEL: func @pad_with_producer_dispatch() +// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x56x56x256xf32 +// CHECK: %[[FILTER:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x1x256x128xf32 +// CHECK: %[[BIAS:.+]] = hal.interface.binding.subspan {{.+}} : memref<128xf32 +// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x30x30x128xf32 +// CHECK: scf.for +// CHECK: scf.for +// CHECK: scf.if +// CHECK: else +// CHECK-DAG: %[[INPUT_SUBVIEW:.+]] = memref.subview %[[INPUT]] +// CHECK-DAG: %[[FILTER_SUBVIEW:.+]] = memref.subview %[[FILTER]] +// CHECK-DAG: %[[BIAS_SUBVIEW:.+]] = memref.subview %[[BIAS]] +// CHECK-DAG: %[[OUTPUT_SUBVIEW:.+]] = memref.subview %[[OUTPUT]] +// CHECK: scf.for +// CHECK: scf.for +// CHECK-DAG: %[[INPUT_SLICE:.+]] = memref.subview %[[INPUT_SUBVIEW]] +// CHECK-DAG: %[[BIAS_ALLOC:.+]] = memref.alloca +// CHECK: scf.for +// CHECK: %[[FILTER_SLICE:.+]] = memref.subview %[[FILTER_SUBVIEW]] +// CHECK: %[[FILL_ALLOC:.+]] = memref.alloca +// CHECK: linalg.fill +// CHECK-SAME: outs(%[[FILL_ALLOC]] +// CHECK: %[[CONV_OUTPUT:.+]] = memref.subview %[[FILL_ALLOC]] +// CHECK: scf.for +// CHECK: %[[CONV_INPUT:.+]] = memref.subview %[[INPUT_SLICE]] +// CHECK: %[[CONV_FILTER:.+]] = memref.subview %[[FILTER_SLICE]] +// CHECK: linalg.conv_2d_nhwc_hwcf +// CHECK-SAME: ins(%[[CONV_INPUT]], %[[CONV_FILTER]] : +// CHECK-SAME: outs(%[[CONV_OUTPUT]] : +// CHECK: %[[BIAS_INPUT:.+]] = memref.subview %[[BIAS_SUBVIEW]] +// CHECK: linalg.generic +// CHECK-SAME: ins(%[[CONV_OUTPUT]], %[[BIAS_INPUT]] : +// CHECK-SAME: outs(%[[BIAS_ALLOC]] +// CHECK: %[[OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SUBVIEW]] +// CHECK: linalg.fill ins(%{{.+}} : f32) outs(%[[OUTPUT_SLICE]] +// CHECK: %[[INTERIOR_SLICE:.+]] = memref.subview %[[OUTPUT_SLICE]] +// CHECK: linalg.generic +// CHECK-SAME: ins(%[[BIAS_ALLOC]] : +// CHECK-SAME: outs(%[[INTERIOR_SLICE]] : + +// ----- + +hal.executable private @pad_consumer_fusion { + hal.executable.variant public @embedded_elf_x86_64, target = <"llvm-cpu", "embedded-elf-x86_64", { + cpu = "generic", cpu_features = "", + data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", + native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> { + hal.executable.export public @pad_consumer_fusion_dispatch ordinal(0) + layout(#hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @pad_consumer_fusion_dispatch() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan set(0) binding(0) 
type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) + : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) + : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] + : !flow.dispatch.tensor> -> tensor<1x14x14x256xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 256, 256], strides = [1, 1, 1, 1] + : !flow.dispatch.tensor> -> tensor<3x3x256x256xf32> + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] + : !flow.dispatch.tensor> -> tensor<1x14x14x256xf32> + %padded = tensor.pad %3 low[0, 1, 1, 0] high[0, 1, 1, 0] { + ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index): + tensor.yield %cst : f32 + } : tensor<1x14x14x256xf32> to tensor<1x16x16x256xf32> + %6 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + ins(%padded, %4 : tensor<1x16x16x256xf32>, tensor<3x3x256x256xf32>) outs(%5 : tensor<1x14x14x256xf32>) -> tensor<1x14x14x256xf32> + flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] + : tensor<1x14x14x256xf32> -> !flow.dispatch.tensor> + return + } + } + } +} +// CHECK-LABEL: func @pad_consumer_fusion_dispatch() +// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x14x14x256xf32> +// CHECK: %[[FILTER:.+]] = hal.interface.binding.subspan {{.+}} : memref<3x3x256x256xf32> +// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x14x14x256xf32> +// CHECK-DAG: %[[FILTER_SUBVIEW:.+]] = memref.subview %[[FILTER]] +// CHECK-DAG: %[[OUTPUT_SUBVIEW:.+]] = memref.subview %[[OUTPUT]] +// CHECK: scf.for +// CHECK: scf.for +// CHECK: scf.for +// CHECK: %[[OUTPUT_SUBVIEW_0:.+]] = memref.subview %[[OUTPUT_SUBVIEW]] +// CHECK: scf.for +// CHECK: scf.for +// CHECK: scf.for +// CHECK-COUNT-7: vector.load %[[INPUT]] +// CHECK-COUNT-8: vector.load %[[FILTER_SUBVIEW]] +// CHECK-COUNT-8: vector.outerproduct +// CHECK: scf.yield +// CHECK: scf.yield +// CHECK: scf.yield +// CHECK-COUNT-7: vector.store %{{.+}}, %[[OUTPUT_SUBVIEW_0]] diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp index 45f7634c5a2c..10a41f417a5f 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp @@ -498,21 +498,12 @@ std::optional isTiledAndDistributedLoop( } SmallVector getComputeOps(func::FuncOp funcOp) { - Block *body = &funcOp.getFunctionBody().front(); - auto forOps = body->getOps(); - while (!forOps.empty()) { - assert(llvm::hasSingleElement(forOps) && - "expected dispatch function with single block"); - scf::ForOp forOp = *(forOps.begin()); - body = forOp.getBody(); - forOps = body->getOps(); - } SmallVector computeOps; - for (Operation &op : *body) { - if (isa(&op)) { - computeOps.push_back(&op); + funcOp.walk([&](Operation *op) { + if (isa(op)) { + computeOps.push_back(op); } - } + }); return computeOps; } diff --git a/compiler/src/iree/compiler/Tools/init_mlir_dialects.h b/compiler/src/iree/compiler/Tools/init_mlir_dialects.h index 4cfc41d1ca9b..2b4f034e1ec3 100644 --- a/compiler/src/iree/compiler/Tools/init_mlir_dialects.h +++ 
b/compiler/src/iree/compiler/Tools/init_mlir_dialects.h
@@ -72,7 +72,7 @@ inline void registerMlirDialects(DialectRegistry &registry) {
                   shape::ShapeDialect>();
   // clang-format on
   tensor::registerInferTypeOpInterfaceExternalModels(registry);
-  tensor::registerTilingInterfaceExternalModelsForPackUnPackOps(registry);
+  tensor::registerTilingInterfaceExternalModels(registry);
 #ifdef IREE_HAVE_C_OUTPUT_FORMAT
   registry.insert<emitc::EmitCDialect>();
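
For reference, the `scf.if` workaround described in the commit message has
roughly the shape sketched below. This is a hand-written sketch: the shapes,
tile sizes, and SSA names are illustrative and are not taken from the tests in
this patch; upstream pad tiling typically materializes the all-padding tile
with a `tensor.generate` in the `then` branch.

```
// Before folding: tiling tensor.pad guards against a tile that reads only
// padding (shapes here are illustrative).
func.func @tiled_pad_before_fold(%source: tensor<16x16xf32>,
    %off0: index, %off1: index,
    %tile_is_all_padding: i1) -> tensor<4x8xf32> {
  %cst = arith.constant 0.0 : f32
  %result = scf.if %tile_is_all_padding -> (tensor<4x8xf32>) {
    // Tile reads only the padding value.
    %full_pad = tensor.generate {
    ^bb0(%i: index, %j: index):
      tensor.yield %cst : f32
    } : tensor<4x8xf32>
    scf.yield %full_pad : tensor<4x8xf32>
  } else {
    // Tile reads part of the source and pads the rest.
    %slice = tensor.extract_slice %source[%off0, %off1] [2, 6] [1, 1]
        : tensor<16x16xf32> to tensor<2x6xf32>
    %padded = tensor.pad %slice low[1, 1] high[1, 1] {
    ^bb0(%i: index, %j: index):
      tensor.yield %cst : f32
    } : tensor<2x6xf32> to tensor<4x8xf32>
    scf.yield %padded : tensor<4x8xf32>
  }
  return %result : tensor<4x8xf32>
}

// After foldIfGeneratedFromPadding: the else body is inlined and the scf.if
// is dropped, which is valid here because IREE's dispatch region formation
// never produces a tile that reads only padding.
func.func @tiled_pad_after_fold(%source: tensor<16x16xf32>,
    %off0: index, %off1: index) -> tensor<4x8xf32> {
  %cst = arith.constant 0.0 : f32
  %slice = tensor.extract_slice %source[%off0, %off1] [2, 6] [1, 1]
      : tensor<16x16xf32> to tensor<2x6xf32>
  %padded = tensor.pad %slice low[1, 1] high[1, 1] {
  ^bb0(%i: index, %j: index):
    tensor.yield %cst : f32
  } : tensor<2x6xf32> to tensor<4x8xf32>
  return %padded : tensor<4x8xf32>
}
```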