[LLVMGPU] Move bufferization after vectorization for matmulSIMT #10217

Merged 2 commits on Aug 29, 2022
@@ -95,9 +95,11 @@ static void populateVectorizationPatterns(RewritePatternSet &patterns) {
/// Compute a vector size so that the number of elements is equal to the flat
/// workgroup size.
static Optional<SmallVector<int64_t, 4>> getGPUNativeVectorSize(
Operation *op, int64_t flatWorkgroupSize) {
Operation *op, int64_t flatWorkgroupSize,
const llvm::SmallDenseSet<VectorTransferOpInterface> &opsToIgnore) {
auto vt = dyn_cast<VectorTransferOpInterface>(op);
if (!vt) return llvm::None;
if (opsToIgnore.count(vt)) return llvm::None;
if (!vt.permutation_map().isMinorIdentity()) return llvm::None;
ArrayRef<int64_t> shape = vt.getVectorType().getShape();
int targetVectorSize =
@@ -121,10 +123,11 @@ static Optional<SmallVector<int64_t, 4>> getGPUNativeVectorSize(
return unroll;
}

static void populateVectorUnrollPatterns(RewritePatternSet &patterns,
int64_t flatWorkgroupSize) {
auto getShape = [flatWorkgroupSize](Operation *op) {
return getGPUNativeVectorSize(op, flatWorkgroupSize);
static void populateVectorUnrollPatterns(
RewritePatternSet &patterns, int64_t flatWorkgroupSize,
const llvm::SmallDenseSet<VectorTransferOpInterface> &opsToIgnore) {
auto getShape = [flatWorkgroupSize, &opsToIgnore](Operation *op) {
return getGPUNativeVectorSize(op, flatWorkgroupSize, opsToIgnore);
};
vector::populateVectorUnrollPatterns(
patterns, vector::UnrollVectorOptions().setNativeShapeFn(getShape));
@@ -152,9 +155,13 @@ static Value createFlatId(func::FuncOp funcOp,
}

/// Distribute transfer read operations on the given thread ids.
static void distributeTransferRead(func::FuncOp funcOp, Value flatThreadId,
int64_t flatWorkgroupSize) {
static void distributeTransferRead(
func::FuncOp funcOp, Value flatThreadId, int64_t flatWorkgroupSize,
const llvm::SmallDenseSet<VectorTransferOpInterface> &opsToIgnore) {
funcOp.walk([&](vector::TransferReadOp readOp) {
if (opsToIgnore.count(
cast<VectorTransferOpInterface>(readOp.getOperation())))
return WalkResult::advance();
OpBuilder b(readOp);
Value id = flatThreadId;
SmallVector<int64_t, 2> multiplier;
@@ -195,6 +202,40 @@ static void distributeTransferRead(func::FuncOp funcOp, Value flatThreadId,
readOp.getResult().replaceAllUsesExcept(ops->insert.getResult(),
extractOp);
}
return WalkResult::advance();
});
}

/// Hoist allocations to the top of the function if they have no dependencies.
static void hoistAlloc(func::FuncOp funcOp) {
SmallVector<memref::AllocOp> allocs;
funcOp.walk([&](memref::AllocOp alloc) {
if (alloc.getOperands().empty()) allocs.push_back(alloc);
});
for (memref::AllocOp alloc : allocs) {
alloc->moveBefore(&(*funcOp.getBlocks().begin()),
funcOp.getBlocks().begin()->begin());
}
}

/// We insert barriers conservatively; remove those that are obviously not
/// needed.
static void removeRedundantBarriers(func::FuncOp funcOp) {
funcOp.walk([](linalg::GenericOp copyOp) {
if (hasMarker(copyOp, getCopyToWorkgroupMemoryMarker())) {
Operation *prevOp = copyOp->getPrevNode();
SmallVector<Operation *> redundantBarriers;
while (prevOp) {
if (isa<gpu::BarrierOp>(prevOp))
redundantBarriers.push_back(prevOp);
else
break;
prevOp = prevOp->getPrevNode();
}
if (prevOp && hasMarker(prevOp, getCopyToWorkgroupMemoryMarker())) {
for (Operation *op : redundantBarriers) op->erase();
}
}
});
}

@@ -219,6 +260,11 @@ class GPUDistributeSharedMemoryCopyPass
copiesToWorkgroupMem.push_back(copyOp);
});
if (copiesToWorkgroupMem.empty()) return;

// Step 0. First clean up the IR.
hoistAlloc(funcOp);
removeRedundantBarriers(funcOp);

int64_t flatWorkgroupSize =
workgroupSize[0] * workgroupSize[1] * workgroupSize[2];
bool isAligned = llvm::all_of(
@@ -232,6 +278,11 @@ class GPUDistributeSharedMemoryCopyPass
targetVectorSize);
});
if (isAligned) {
// Ignore all the existing vector transfer ops.
llvm::SmallDenseSet<VectorTransferOpInterface> opsToIgnore;
funcOp.walk([&](VectorTransferOpInterface transferOp) {
opsToIgnore.insert(transferOp);
});
// Step 1. Vectorize the shared memory copy.
RewritePatternSet vectorizationPatterns(context);
populateVectorizationPatterns(vectorizationPatterns);
@@ -245,14 +296,15 @@ class GPUDistributeSharedMemoryCopyPass
// transfer op generated can then be distributed to a single op of target
// size.
RewritePatternSet vectorUnrollPatterns(context);
populateVectorUnrollPatterns(vectorUnrollPatterns, flatWorkgroupSize);
populateVectorUnrollPatterns(vectorUnrollPatterns, flatWorkgroupSize,
opsToIgnore);
if (failed(applyPatternsAndFoldGreedily(
funcOp, std::move(vectorUnrollPatterns)))) {
return signalPassFailure();
}
// Step 3. Distribute the transfer ops onto the flat ids.
Value flatId = createFlatId(funcOp, workgroupSize);
distributeTransferRead(funcOp, flatId, flatWorkgroupSize);
distributeTransferRead(funcOp, flatId, flatWorkgroupSize, opsToIgnore);
// Propagate vector distribution to the chain of ops.
RewritePatternSet distributePatterns(context);
vector::populatePropagateVectorDistributionPatterns(distributePatterns);
3 changes: 3 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD
@@ -23,6 +23,7 @@ iree_compiler_cc_library(
"LLVMGPULowerExecutableTarget.cpp",
"LLVMGPUMultiBuffering.cpp",
"LLVMGPUReduceBankConflicts.cpp",
"LLVMGPUTensorAlloc.cpp",
"LLVMGPUTensorCoreVectorization.cpp",
"LLVMGPUTileAndDistribute.cpp",
"LLVMGPUTileTensor.cpp",
@@ -36,6 +37,7 @@ iree_compiler_cc_library(
hdrs = [
"ConvertToLLVM.h",
"KernelConfig.h",
"TilingUtils.h",
],
deps = [
"//compiler/src/iree/compiler/Codegen:PassHeaders",
@@ -59,6 +61,7 @@ iree_compiler_cc_library(
"@llvm-project//mlir:ArithmeticDialect",
"@llvm-project//mlir:ArithmeticToLLVM",
"@llvm-project//mlir:ArithmeticTransforms",
"@llvm-project//mlir:BufferizationDialect",
"@llvm-project//mlir:ControlFlowToLLVM",
"@llvm-project//mlir:FuncDialect",
"@llvm-project//mlir:FuncToLLVM",
3 changes: 3 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -16,6 +16,7 @@ iree_cc_library(
HDRS
"ConvertToLLVM.h"
"KernelConfig.h"
"TilingUtils.h"
SRCS
"ConvertToLLVM.cpp"
"ConvertToNVVM.cpp"
@@ -25,6 +26,7 @@ iree_cc_library(
"LLVMGPULowerExecutableTarget.cpp"
"LLVMGPUMultiBuffering.cpp"
"LLVMGPUReduceBankConflicts.cpp"
"LLVMGPUTensorAlloc.cpp"
"LLVMGPUTensorCoreVectorization.cpp"
"LLVMGPUTileAndDistribute.cpp"
"LLVMGPUTileTensor.cpp"
@@ -47,6 +49,7 @@ iree_cc_library(
MLIRArithmeticDialect
MLIRArithmeticToLLVM
MLIRArithmeticTransforms
MLIRBufferizationDialect
MLIRControlFlowToLLVM
MLIRFuncDialect
MLIRFuncToLLVM
@@ -25,22 +25,6 @@
namespace mlir {
namespace iree_compiler {

/// Clean up barriers if we have no shared memory allocations we expect to not
/// need any barriers and remove them.
static void cleanUpBarriers(func::FuncOp funcOp) {
bool hasAlloc = false;
SmallVector<Operation*> barriers;
funcOp.walk([&](Operation* op) {
if (isa<memref::AllocOp>(op))
hasAlloc = true;
else if (isa<gpu::BarrierOp>(op))
barriers.push_back(op);
});
if (!hasAlloc) {
for (Operation* op : barriers) op->erase();
}
}

namespace {
struct LLVMGPUDistributePass
: public LLVMGPUDistributeBase<LLVMGPUDistributePass> {
@@ -61,15 +45,12 @@ struct LLVMGPUDistributePass
for (scf::ForeachThreadOp op : foreachOps) {
IRRewriter rewriter(op->getContext());
rewriter.setInsertionPoint(op);
if (failed(rewriteForeachThreadToGpu(op, workgroupSize, rewriter))) {
if (failed(
rewriteForeachThreadToGpu(op, workgroupSize, rewriter,
/*syncAfterDistribute=*/false))) {
return signalPassFailure();
}
}

// Workaround, since we conservatively insert barrier ops, remove them if
// they are obviously not needed.
// TODO(thomasraoux): Improve barrier placement.
cleanUpBarriers(funcOp);
}
};
} // namespace
@@ -0,0 +1,72 @@
// Copyright 2022 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/LLVMGPU/TilingUtils.h"
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Passes.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Transforms/Passes.h"

#define DEBUG_TYPE "iree-llvmgpu-alloc"

namespace mlir {
namespace iree_compiler {

/// Filter to decide which ops need allocations.
static bool filter(Operation *op) {
auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
if (!linalgOp) return false;
// Can't promote dynamic shapes.
if (linalgOp.hasDynamicShape()) return false;
return linalg::isaContractionOpInterface(op) &&
linalgOp.getNumParallelLoops() >= 2 &&
linalgOp.getNumParallelLoops() <= 3;
}

namespace {
struct LLVMGPUTensorAllocPass
: public LLVMGPUTensorAllocBase<LLVMGPUTensorAllocPass> {
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<bufferization::BufferizationDialect>();
}
void runOnOperation() override {
auto funcOp = getOperation();

// Tile the reduction first to reduce the alloc size.
if (failed(tileReduction(funcOp))) {
return signalPassFailure();
}

SmallVector<Operation *> opsToPromote;
funcOp.walk([&](Operation *op) {
if (filter(op)) opsToPromote.push_back(op);
});
for (Operation *op : opsToPromote) {
OpBuilder builder(op);
auto linalgOp = cast<linalg::LinalgOp>(op);
bufferization::BufferizationOptions options;
// Promote all the input operands.
for (auto operand : linalgOp.getInputOperands()) {
FailureOr<Value> ret = bufferization::allocateTensorForShapedValue(
builder, op->getLoc(), operand->get(), false, options, true);
if (failed(ret)) {
return signalPassFailure();
}
Value v = ret.getValue();
operand->get().replaceAllUsesExcept(v, v.getDefiningOp());
}
}
}
};
} // namespace

std::unique_ptr<OperationPass<func::FuncOp>> createLLVMGPUTensorAlloc() {
return std::make_unique<LLVMGPUTensorAllocPass>();
}

} // namespace iree_compiler
} // namespace mlir
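For orientation only, and not part of this diff: a minimal sketch of how the new pass might be scheduled in a pass pipeline, assuming createLLVMGPUTensorAlloc() is declared in iree/compiler/Codegen/Passes.h (as the includes above suggest). The pipeline-building function name below is hypothetical.

#include "iree/compiler/Codegen/Passes.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"

namespace mlir {
namespace iree_compiler {

// Hypothetical pipeline hook; only createLLVMGPUTensorAlloc() comes from
// this change.
static void buildMatmulSimtPipeline(OpPassManager &pm) {
  // Promote matmul operands to tensor allocations while still at the tensor
  // level, so vectorization can run before bufferization.
  pm.addNestedPass<func::FuncOp>(createLLVMGPUTensorAlloc());
  // Vectorization and bufferization passes would be appended here.
}

}  // namespace iree_compiler
}  // namespace mlir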
@@ -9,6 +9,7 @@
#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
#include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h"
#include "iree/compiler/Codegen/LLVMGPU/TilingUtils.h"
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Passes.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
@@ -273,27 +274,11 @@ struct LLVMGPUTileAndDistributePass
propagateFillIntoPromotionAlloc(funcOp);
}

{
// Tile again at the workgroup level since reduction dimensions were
// ignored. Dimensions already tiled will be ignored since we tile to the
// same size.
RewritePatternSet wgTilingPatterns(context);
populateTilingReductionPatterns(wgTilingPatterns);
if (failed(applyPatternsAndFoldGreedily(funcOp,
std::move(wgTilingPatterns)))) {
return signalPassFailure();
}
}

{
RewritePatternSet wgTilingCanonicalizationPatterns =
linalg::getLinalgTilingCanonicalizationPatterns(context);
populateAffineMinSCFCanonicalizationPattern(
wgTilingCanonicalizationPatterns);
if (failed(applyPatternsAndFoldGreedily(
funcOp, std::move(wgTilingCanonicalizationPatterns)))) {
return signalPassFailure();
}
// Tile again at the workgroup level since reduction dimensions were
// ignored. Dimensions already tiled will be ignored since we tile to the
// same size.
if (failed(tileReduction(funcOp))) {
return signalPassFailure();
}

LLVM_DEBUG({
26 changes: 12 additions & 14 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileTensor.cpp
@@ -51,14 +51,14 @@ static void populateTilingReductionPatterns(RewritePatternSet &patterns) {
linalg::LinalgTransformationFilter filter(
ArrayRef<StringAttr>{
StringAttr::get(context, getWorkgroupMemoryMarker())},
StringAttr::get(context, getVectorizeMarker()));
StringAttr::get(context, getWorkgroupKTiledMarker()));
filter.setMatchByDefault();
linalg::TilingPatterns<linalg::MatmulOp, linalg::BatchMatmulOp,
linalg::GenericOp>::insert(patterns, tilingOptions,
filter);
}

static LogicalResult tileReduction(func::FuncOp funcOp) {
LogicalResult tileReduction(func::FuncOp funcOp) {
{
// Tile again at the workgroup level since reduction dimensions were
// ignored. Dimensions already tiled will be ignored since we tile to the
@@ -92,24 +92,22 @@ static LogicalResult tileParallelDims(func::FuncOp funcOp,
std::array<int64_t, 3> elementPerWorkgroup = {
distributeToWarp ? workgroupSize[0] / kWarpSize : workgroupSize[0],
workgroupSize[1], workgroupSize[2]};
SmallVector<Operation *> computeOps;
SmallVector<LoopTilingAndDistributionInfo> tiledLoops;
if (failed(getComputeOps(funcOp, computeOps, tiledLoops))) {
return funcOp.emitOpError("failed to get compute ops");
}
SmallVector<TilingInterface> computeOps;
funcOp.walk([&](TilingInterface op) { computeOps.push_back(op); });
Contributor:
This is going to be walking the function through all nesting etc. How do you ensure that the ops picked up here are not scoped at the same level?

Contributor (Author):
Each op is tiled individually (there is no tile-and-fuse at this level). This was already the case for the second level of the LLVMGPU backend. I'm not sure I see the problem?

Contributor:
Earlier this was using getComputeOps. That is very constrained: it looks for ops within a single block and errors out if the dispatch deviates from that, which is intentional to avoid compilation going down unintended paths. A walk is more free-form. How do you guarantee that the ops collected here are all to be tiled at the same time? So it's not the same.

Contributor (Author):
Right, a walk is what I want, I think, since I want to distribute all operations. I could look at the attribute, but I don't think I need to, as I always want to distribute all the ops across the group.

Contributor:
Asking differently: is there a case here where ops from different blocks need to be distributed? Most of the ops we use have a single block, so different blocks represent different levels of nesting. From that I am unsure that distributing ops at different levels of nesting is valid... So I think flipping this to not distribute ops at different levels of nesting at the same time, and erroring out inside, avoids unintended lowering.

Contributor (Author):
We tile the reduction loop before doing the second level of tile-and-distribute (to handle shared memory promotion), so we do end up with a linalg.matmul inside an scf.for region while the fused ops sit in the top-level basic block. Both of those need to be distributed.
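A minimal C++ sketch of the distinction being discussed (illustrative only, not part of this change; the helper names are hypothetical): a walk recurses into nested regions, so it collects both the top-level fused ops and the matmul that reduction tiling leaves inside the scf.for, while an entry-block-only scan, roughly what the old getComputeOps-based collection did, only sees the top level.

#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Interfaces/TilingInterface.h"

namespace {

// Visits every nested region, so ops inside scf.for bodies are collected too.
llvm::SmallVector<mlir::TilingInterface> collectTileableOpsByWalk(
    mlir::func::FuncOp funcOp) {
  llvm::SmallVector<mlir::TilingInterface> ops;
  funcOp.walk([&](mlir::TilingInterface op) { ops.push_back(op); });
  return ops;
}

// Only inspects the entry block, so a matmul nested inside an scf.for created
// by reduction tiling would be skipped.
llvm::SmallVector<mlir::TilingInterface> collectEntryBlockTileableOps(
    mlir::func::FuncOp funcOp) {
  llvm::SmallVector<mlir::TilingInterface> ops;
  for (mlir::Operation &op : funcOp.getBlocks().front())
    if (auto tilingOp = llvm::dyn_cast<mlir::TilingInterface>(&op))
      ops.push_back(tilingOp);
  return ops;
}

}  // namespace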

Contributor (Author):
Note that the reduction loop being outside causes other problems that I'm trying to solve (I shared a doc with you where I start to discuss this), so ideally we would remove it at some point, but for now I don't have an alternative solution for moving the bufferization down.

Contributor:
Yeah, that makes sense and gives me a better idea. Thanks for explaining!


for (Operation *op : computeOps) {
auto tilingOp = dyn_cast<TilingInterface>(op);
if (!tilingOp) continue;
for (TilingInterface tilingOp : computeOps) {
size_t numLoops = 0;
for (auto type : tilingOp.getLoopIteratorTypes()) {
if (type == getParallelIteratorTypeName()) numLoops++;
}
IRRewriter rewriter(op->getContext());
rewriter.setInsertionPoint(op);
auto interfaceOp = cast<PartitionableLoopsInterface>(*op);
IRRewriter rewriter(tilingOp->getContext());
rewriter.setInsertionPoint(tilingOp);
auto interfaceOp =
cast<PartitionableLoopsInterface>(*tilingOp.getOperation());
auto partitionedLoops =
interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
// If there are no dimensions to tile skip the transformation.
if (partitionedLoops.empty()) continue;
SmallVector<OpFoldResult> numThreads(numLoops, rewriter.getIndexAttr(0));
int64_t id = 0;
int64_t threadId = 0;
@@ -126,7 +124,7 @@

auto tilingResult =
linalg::tileToForeachThreadOp(rewriter, tilingOp, numThreads, idDims);
rewriter.replaceOp(op, tilingResult->tileOp->getResults());
rewriter.replaceOp(tilingOp, tilingResult->tileOp->getResults());
}
return success();
}