From 3536411f7b2fd08bd7a544a9e91c54690738f43d Mon Sep 17 00:00:00 2001
From: athangam
Date: Sun, 10 Nov 2024 22:57:13 -0800
Subject: [PATCH 01/14] This pass hoists vector transfer operations of
 resultant matrix outside the reduction/k loop

---
 include/TPP/Passes.td                         |  12 ++
 lib/TPP/Transforms/CMakeLists.txt             |   1 +
 lib/TPP/Transforms/HoistVectorTransfers.cpp   | 154 ++++++++++++++++++
 .../hoist-vector-transfer-brgemm.mlir         | 136 ++++++++++++++++
 4 files changed, 303 insertions(+)
 create mode 100644 lib/TPP/Transforms/HoistVectorTransfers.cpp
 create mode 100644 test/Integration/hoist-vector-transfer-brgemm.mlir

diff --git a/include/TPP/Passes.td b/include/TPP/Passes.td
index ad24532fa..586828a90 100644
--- a/include/TPP/Passes.td
+++ b/include/TPP/Passes.td
@@ -53,6 +53,18 @@ def VectorizationPass : Pass<"vectorization-pass",
   let dependentDialects = [ "memref::MemRefDialect", "linalg::LinalgDialect", "vector::VectorDialect" ];
 }
+
+
+def HoistVectorTransfers : Pass<"hoist-vector-transfer"> {
+  let summary = "Hoist vector transfer operation outside of reduction and k loop";
+  let description = [{
+    Hoists the vector transfer read and write operations of 'C' matrix outside the reduction and k loop for an brgemm operation. This pass should be applied after the BrgemmLinalgTiling Pass.
+  }];
+  let dependentDialects = [ "vector::VectorDialect" ];
+}
+
+
+
 def VectorContractToOuterproduct : Pass<
     "vector-contract-to-outerproduct"> {
   let summary = "Perform outerproduct lowering of vector contraction ops";
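[For illustration only; this sketch is not part of the patch. It shows, on a hypothetical tiled brgemm loop nest with made-up names, bounds, and shapes, the rewrite the new pass performs: the accumulator tile is read once before the reduction/k loops, carried through iter_args, and written back once after the loops, instead of being re-read and re-written on every iteration.]

  // Before: the accumulator tile is re-read and re-written on every iteration.
  scf.for %r = %c0 to %c24 step %c1 {       // reduction (batch) loop
    scf.for %k = %c0 to %c64 step %c1 {     // k loop
      %acc = vector.transfer_read %C[%c0, %c0], %pad : memref<4x64xf32>, vector<4x64xf32>
      %res = vector.contract {...} %a, %b, %acc : vector<1x4x1xf32>, vector<1x1x64xf32> into vector<4x64xf32>
      vector.transfer_write %res, %C[%c0, %c0] : vector<4x64xf32>, memref<4x64xf32>
    }
  }

  // After: read once, carry the accumulator through iter_args, write once.
  %init = vector.transfer_read %C[%c0, %c0], %pad : memref<4x64xf32>, vector<4x64xf32>
  %sum = scf.for %r = %c0 to %c24 step %c1 iter_args(%acc_r = %init) -> (vector<4x64xf32>) {
    %inner = scf.for %k = %c0 to %c64 step %c1 iter_args(%acc_k = %acc_r) -> (vector<4x64xf32>) {
      %res = vector.contract {...} %a, %b, %acc_k : vector<1x4x1xf32>, vector<1x1x64xf32> into vector<4x64xf32>
      scf.yield %res : vector<4x64xf32>
    }
    scf.yield %inner : vector<4x64xf32>
  }
  vector.transfer_write %sum, %C[%c0, %c0] : vector<4x64xf32>, memref<4x64xf32>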
diff --git a/lib/TPP/Transforms/CMakeLists.txt b/lib/TPP/Transforms/CMakeLists.txt
index b5e27f6c9..9f9442fee 100644
--- a/lib/TPP/Transforms/CMakeLists.txt
+++ b/lib/TPP/Transforms/CMakeLists.txt
@@ -27,6 +27,7 @@ add_mlir_library(TPPTransforms
   Vectorization.cpp
   SplitReductionDim.cpp
   VectorContractToOuterproduct.cpp
+  HoistVectorTransfers.cpp

   ADDITIONAL_HEADER_DIRS
     ${PROJECT_SOURCE_DIR}/include/TPP
diff --git a/lib/TPP/Transforms/HoistVectorTransfers.cpp b/lib/TPP/Transforms/HoistVectorTransfers.cpp
new file mode 100644
index 000000000..7d55cae3e
--- /dev/null
+++ b/lib/TPP/Transforms/HoistVectorTransfers.cpp
@@ -0,0 +1,154 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hoisting of the vector transfer operations of the
+// result matrix outside the reduction and k loops of a tiled brgemm.
+//
+//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/IRMapping.h"
+namespace mlir {
+namespace tpp {
+#define GEN_PASS_DEF_HOISTVECTORTRANSFERS
+#include "TPP/Passes.h.inc"
+} // namespace tpp
+} // namespace mlir
+
+using namespace mlir;
+using namespace vector;
+
+namespace mlir {
+namespace tpp {
+
+struct HoistVectorTransferOp : OpRewritePattern<vector::ContractionOp> {
+  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+
+  HoistVectorTransferOp(MLIRContext *ctx)
+      : OpRewritePattern<vector::ContractionOp>(ctx) {}
+
+  LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
+                                PatternRewriter &rewriter) const override {
+    //llvm::outs() << "The defining operation is: Arun" << "\n";
+    // Code to hoist vector transfer read before the reduction and k loop
+    auto vectorReadOp = contractOp.getOperand(contractOp.getNumOperands()-1).getDefiningOp();
+    if (vectorReadOp) {
+      auto subviewOp = vectorReadOp->getOperand(0).getDefiningOp();
+      rewriter.setInsertionPointAfter(subviewOp);
+      auto retriveVectorReadOp = llvm::dyn_cast<vector::TransferReadOp>(vectorReadOp);
+      auto *cloneVectorReadOp = rewriter.clone(*retriveVectorReadOp);
+      contractOp.setOperand(contractOp.getNumOperands()-1, (*cloneVectorReadOp).getResult(0));
+      retriveVectorReadOp.replaceAllUsesWith(cloneVectorReadOp);
+
+      // Code to re-create the reduction and k loop with iter args to
+      auto *nextOp = (*cloneVectorReadOp).getNextNode();
+      if (nextOp) {
+        auto vectorReadOpValue = (*cloneVectorReadOp).getResult(0);
+        auto oldReductionForOp = llvm::dyn_cast<scf::ForOp>(*nextOp);
+        auto oldKForOp = llvm::dyn_cast<scf::ForOp>(oldReductionForOp.getBody()->front());
+
+        rewriter.setInsertionPoint(oldReductionForOp);
+        auto newReductionForOp = rewriter.create<scf::ForOp>(
+            oldReductionForOp.getLoc(), oldReductionForOp.getLowerBound(), oldReductionForOp.getUpperBound(),
+            oldReductionForOp.getStep(),ValueRange{vectorReadOpValue},
+            [&](OpBuilder &rewriterNewReductionForOp, Location locNewReductionForOp, Value ivNewReductionForOp,
+                ValueRange iterArgsNewReductionForOp) {
+              auto newKForOp = rewriter.create<scf::ForOp>(
+                  oldKForOp.getLoc(), oldKForOp.getLowerBound(), oldKForOp.getUpperBound(),
+                  oldKForOp.getStep(), iterArgsNewReductionForOp,
+                  [&](OpBuilder &rewriterNewKForOp, Location locNewKForOp, Value ivNewKForOp,
+                      ValueRange iterArgsNewKForOp) {
+                    mlir::IRMapping mapper;
+                    mapper.map(oldReductionForOp.getInductionVar(), ivNewReductionForOp);
+                    mapper.map(oldKForOp.getInductionVar(), ivNewKForOp);
+
+                    for (auto [origArgReduction, newArgReduction] :
+                         llvm::zip(oldReductionForOp.getRegionIterArgs(), iterArgsNewReductionForOp)) {
+                      mapper.map(origArgReduction, newArgReduction);
+                    }
+
+                    for (auto [origArgK, newArgK] :
+                         llvm::zip(oldKForOp.getRegionIterArgs(), iterArgsNewKForOp)) {
+                      mapper.map(origArgK, newArgK);
+                    }
+
+                    for (auto &op : oldKForOp.getBody()->without_terminator()) {
+                      rewriterNewKForOp.clone(op, mapper);
+                    }
+
+                    rewriterNewKForOp.create<scf::YieldOp>(locNewKForOp, iterArgsNewKForOp);
+                  });
+              rewriterNewReductionForOp.create<scf::YieldOp>(locNewReductionForOp, newKForOp.getResult(0));
+            });
+
+        // Code to hoist vector transfer write after reduction loop and also to update the yield of k loop
+        auto newKForOp = llvm::dyn_cast<scf::ForOp>(newReductionForOp.getBody()->front());
+        Value newcontractOpValue;
+        mlir::vector::TransferWriteOp vectorWriteOperation;
+        mlir::Block *bodyBlock = newKForOp.getBody();
+        for (auto &op : bodyBlock->getOperations()) {
+          if (auto vectorContractOp = llvm::dyn_cast<vector::ContractionOp>(op)) {
+            vectorContractOp.setOperand(vectorContractOp.getNumOperands()-1, newKForOp.getRegionIterArgs()[0]);
+            newcontractOpValue = vectorContractOp.getResult();
+          }
+          if (auto yieldOp = llvm::dyn_cast<scf::YieldOp>(op)) {
+            if ( newcontractOpValue != NULL)
+              yieldOp.setOperand(0, newcontractOpValue);
+          }
+          if (auto vectorWriteOp = llvm::dyn_cast<vector::TransferWriteOp>(op)) {
+            vectorWriteOperation = vectorWriteOp;
+          }
+        }
+
+        if (vectorWriteOperation != NULL) {
+          vectorWriteOperation.setOperand(0,newReductionForOp.getResult(0));
+          vectorWriteOperation->moveBefore(oldReductionForOp);
+        }
+
+        // Erase the vector contract operation
+        for (auto result : contractOp->getResults()) {
+          for (auto *userOp : result.getUsers()) {
+            userOp->erase();
+          }
+        }
+        contractOp.erase();
+
+      }
+    }
+    return success();
+  }
+};
+
+
+void populateHoistVectorTransferPatterns(RewritePatternSet &patterns) {
+  patterns.add<HoistVectorTransferOp>(patterns.getContext());
+}
+
+struct HoistVectorTransfers
+    : public impl::HoistVectorTransfersBase<HoistVectorTransfers> {
+  using HoistVectorTransfersBase::HoistVectorTransfersBase;
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    populateHoistVectorTransferPatterns(patterns);
+    GreedyRewriteConfig config;
+    config.strictMode = GreedyRewriteStrictness::ExistingOps;
+    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns),
+                                       config);
+  }
+};
+} // namespace tpp
+} // namespace mlir
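[For context, not part of the patch: the pass registers under the --hoist-vector-transfer flag, so a typical invocation, matching the pipeline the integration test converges on later in this series, looks like the following RUN-style line.]

  tpp-opt input.mlir --loop-invariant-code-motion --vectorization-pass --loop-invariant-code-motion --hoist-vector-transfer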
diff --git a/test/Integration/hoist-vector-transfer-brgemm.mlir b/test/Integration/hoist-vector-transfer-brgemm.mlir
new file mode 100644
index 000000000..4a6cac3b1
--- /dev/null
+++ b/test/Integration/hoist-vector-transfer-brgemm.mlir
@@ -0,0 +1,136 @@
+// RUN: tpp-opt %s | tpp-run -e chainedGEMM --entry-point-result=void -print > %t.1
+// RUN: tpp-opt %s --hoist-vector-transfer | tpp-run -e chainedGEMM --entry-point-result=void -print > %t.2
+// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-TWO --allow-empty
+
+// DIFF-TWO-NOT: {{.}}
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+
+  memref.global "private" constant @__constant_48x32x32xf32 : memref<48x32x32xf32> = dense<1.000000e+00> {alignment = 64 : i64}
+  func.func @chainedGEMM(%arg0: memref<8x48x32x32xf32>) -> memref<8x48x32x32xf32> {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<32x32xf32>
+    %c1 = arith.constant 1 : index
+    %c48 = arith.constant 48 : index
+    %c2 = arith.constant 2 : index
+    %c4 = arith.constant 4 : index
+    %c32 = arith.constant 32 : index
+    %c0 = arith.constant 0 : index
+    %0 = memref.get_global @__constant_48x32x32xf32 : memref<48x32x32xf32>
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x48x32x32xf32>
+    scf.forall (%arg1, %arg2) in (8, 48) {
+      %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
+      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
+      %subview_2 = memref.subview %arg0[%arg1, 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
+      scf.for %arg3 = %c0 to %c32 step %c4 {
+        scf.for %arg4 = %c0 to %c32 step %c2 {
+          %subview_3 = memref.subview %subview[%arg3, %arg4] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
+          scf.for %arg5 = %c0 to %c48 step %c1 {
+            scf.for %arg6 = %c0 to %c32 step %c4 {
+              %subview_4 = memref.subview %subview_2[%arg5, %arg3, %arg6] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
+              %subview_5 = memref.subview %0[%arg5, %arg6, %arg4] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
+              %1 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
+              %2 = vector.transfer_read %subview_5[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
+              %3 = vector.transfer_read %subview_3[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
+              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
+              vector.transfer_write %4, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
+            }
+          }
+        }
+      }
+    }
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<8x48x32x32xf32>
+    scf.forall (%arg1, %arg2) in (8, 48) {
+      %subview = memref.subview %alloc_1[%arg1, %arg2, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
+      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
+      %subview_2 = memref.subview %alloc[%arg1, 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
+      scf.for %arg3 = %c0 to %c32 step %c4 {
+        scf.for %arg4 = %c0 to %c32 step %c2 {
+          %subview_3 = memref.subview %subview[%arg3, %arg4] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
+          scf.for %arg5 = %c0 to %c48 step %c1 {
+            scf.for %arg6 = %c0 to %c32 step %c4 {
+              %subview_4 = memref.subview %subview_2[%arg5, %arg3, %arg6] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
+              %subview_5 = memref.subview %0[%arg5, %arg6, %arg4] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
+              %1 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
+              %2 = vector.transfer_read %subview_5[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
+              %3 = vector.transfer_read %subview_3[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
+              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
+              vector.transfer_write %4, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
+            }
+          }
+        }
+      }
+    }
+    scf.forall (%arg1, %arg2) in (8, 48) {
+      %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
+      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
+      %subview_2 = memref.subview %alloc_1[%arg1, 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
+      scf.for %arg3 = %c0 to %c32 step %c4 {
+        scf.for %arg4 = %c0 to %c32 step %c2 {
+          %subview_3 = memref.subview %subview[%arg3, %arg4] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
+          scf.for %arg5 = %c0 to %c48 step %c1 {
+            scf.for %arg6 = %c0 to %c32 step %c4 {
+              %subview_4 = memref.subview %subview_2[%arg5, %arg3, %arg6] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
+              %subview_5 = memref.subview %0[%arg5, %arg6, %arg4] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
+              %1 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
+              %2 = vector.transfer_read %subview_5[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
+              %3 = vector.transfer_read %subview_3[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
+              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
+              vector.transfer_write %4, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
+            }
+          }
+        }
+      }
+    }
+    return %alloc : memref<8x48x32x32xf32>
+  }
+
+
+// -----
+
+// RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.1
+// RUN: tpp-opt %s --hoist-vector-transfer | tpp-run -e entry --entry-point-result=void -print > %t.2
+// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty
+
+// DIFF-NOT: {{.}}
+#map20 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map21 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map22 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+  memref.global "private" constant @__constant_24x64x64xf32 : memref<24x64x64xf32> = dense<1.000000e+00> {alignment = 64 : i64}
+  func.func @entry(%arg0: memref<8x24x32x64xf32>) -> memref<8x24x32x64xf32> {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<32x64xf32>
+    %c1 = arith.constant 1 : index
+    %c24 = arith.constant 24 : index
+    %c64 = arith.constant 64 : index
+    %c4 = arith.constant 4 : index
+    %c32 = arith.constant 32 : index
+    %c0 = arith.constant 0 : index
+    %0 = memref.get_global @__constant_24x64x64xf32 : memref<24x64x64xf32>
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x24x32x64xf32>
+    scf.forall (%arg1, %arg2) in (8, 24) {
+      %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<32x64xf32, strided<[64, 1], offset: ?>>
+      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x64xf32>, memref<32x64xf32, strided<[64, 1], offset: ?>>
+      %subview_1 = memref.subview %arg0[%arg1, 0, 0, 0] [1, 24, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>>
+      scf.for %arg3 = %c0 to %c32 step %c4 {
+        scf.for %arg4 = %c0 to %c64 step %c64 {
+          %subview_2 = memref.subview %subview[%arg3, %arg4] [4, 64] [1, 1] : memref<32x64xf32, strided<[64, 1], offset: ?>> to memref<4x64xf32, strided<[64, 1], offset: ?>>
+          scf.for %arg5 = %c0 to %c24 step %c1 {
+            scf.for %arg6 = %c0 to %c64 step %c1 {
+              %subview_3 = memref.subview %subview_1[%arg5, %arg3, %arg6] [1, 4, 1] [1, 1, 1] : memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>> to memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>
+              %subview_4 = memref.subview %0[%arg5, %arg6, %arg4] [1, 1, 64] [1, 1, 1] : memref<24x64x64xf32> to memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>
+              %1 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>, vector<1x4x1xf32>
+              %2 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>, vector<1x1x64xf32>
+              %3 = vector.transfer_read %subview_2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x64xf32, strided<[64, 1], offset: ?>>, vector<4x64xf32>
+              %4 = vector.contract {indexing_maps = [#map20, #map21, #map22], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x1xf32>, vector<1x1x64xf32> into vector<4x64xf32>
+              vector.transfer_write %4, %subview_2[%c0, %c0] {in_bounds = [true, true]} : vector<4x64xf32>, memref<4x64xf32, strided<[64, 1], offset: ?>>
+            }
+          }
+        }
+      }
+    }
+    return %alloc : memref<8x24x32x64xf32>
+  }
+
+

From a25c754701b2256eb62a1c40bef6ef1a09178717 Mon Sep 17 00:00:00 2001
From: athangam
Date: Sun, 10 Nov 2024 23:59:21 -0800
Subject: [PATCH 02/14] Checking Integration Tests for hoisting pass

---
 .../hoist-vector-transfer-brgemm.mlir         | 95 +------------------
 1 file changed, 2 insertions(+), 93 deletions(-)

diff --git a/test/Integration/hoist-vector-transfer-brgemm.mlir b/test/Integration/hoist-vector-transfer-brgemm.mlir
index 4a6cac3b1..301b5e39f 100644
--- a/test/Integration/hoist-vector-transfer-brgemm.mlir
+++ b/test/Integration/hoist-vector-transfer-brgemm.mlir
@@ -1,94 +1,3 @@
-// RUN: tpp-opt %s | tpp-run -e chainedGEMM --entry-point-result=void -print > %t.1
-// RUN: tpp-opt %s --hoist-vector-transfer | tpp-run -e chainedGEMM --entry-point-result=void -print > %t.2
-// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF-TWO --allow-empty
-
-// DIFF-TWO-NOT: {{.}}
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
-#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
-
-  memref.global "private" constant @__constant_48x32x32xf32 : memref<48x32x32xf32> = dense<1.000000e+00> {alignment = 64 : i64}
-  func.func @chainedGEMM(%arg0: memref<8x48x32x32xf32>) -> memref<8x48x32x32xf32> {
-    %cst = arith.constant 0.000000e+00 : f32
-    %cst_0 = arith.constant dense<0.000000e+00> : vector<32x32xf32>
-    %c1 = arith.constant 1 : index
-    %c48 = arith.constant 48 : index
-    %c2 = arith.constant 2 : index
-    %c4 = arith.constant 4 : index
-    %c32 = arith.constant 32 : index
-    %c0 = arith.constant 0 : index
-    %0 = memref.get_global @__constant_48x32x32xf32 : memref<48x32x32xf32>
-    %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x48x32x32xf32>
-    scf.forall (%arg1, %arg2) in (8, 48) {
-      %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
-      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
-      %subview_2 = memref.subview %arg0[%arg1, 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
-      scf.for %arg3 = %c0 to %c32 step %c4 {
-        scf.for %arg4 = %c0 to %c32 step %c2 {
-          %subview_3 = memref.subview %subview[%arg3, %arg4] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
-          scf.for %arg5 = %c0 to %c48 step %c1 {
-            scf.for %arg6 = %c0 to %c32 step %c4 {
-              %subview_4 = memref.subview %subview_2[%arg5, %arg3, %arg6] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
-              %subview_5 = memref.subview %0[%arg5, %arg6, %arg4] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
-              %1 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
-              %2 = vector.transfer_read %subview_5[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
-              %3 = vector.transfer_read %subview_3[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
-              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
-              vector.transfer_write %4, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
-            }
-          }
-        }
-      }
-    }
-    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<8x48x32x32xf32>
-    scf.forall (%arg1, %arg2) in (8, 48) {
-      %subview = memref.subview %alloc_1[%arg1, %arg2, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
-      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
-      %subview_2 = memref.subview %alloc[%arg1, 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
-      scf.for %arg3 = %c0 to %c32 step %c4 {
-        scf.for %arg4 = %c0 to %c32 step %c2 {
-          %subview_3 = memref.subview %subview[%arg3, %arg4] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
-          scf.for %arg5 = %c0 to %c48 step %c1 {
-            scf.for %arg6 = %c0 to %c32 step %c4 {
-              %subview_4 = memref.subview %subview_2[%arg5, %arg3, %arg6] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
-              %subview_5 = memref.subview %0[%arg5, %arg6, %arg4] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
-              %1 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
-              %2 = vector.transfer_read %subview_5[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
-              %3 = vector.transfer_read %subview_3[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
-              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
-              vector.transfer_write %4, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
-            }
-          }
-        }
-      }
-    }
-    scf.forall (%arg1, %arg2) in (8, 48) {
-      %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
-      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
-      %subview_2 = memref.subview %alloc_1[%arg1, 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
-      scf.for %arg3 = %c0 to %c32 step %c4 {
-        scf.for %arg4 = %c0 to %c32 step %c2 {
-          %subview_3 = memref.subview %subview[%arg3, %arg4] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
-          scf.for %arg5 = %c0 to %c48 step %c1 {
-            scf.for %arg6 = %c0 to %c32 step %c4 {
-              %subview_4 = memref.subview %subview_2[%arg5, %arg3, %arg6] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
-              %subview_5 = memref.subview %0[%arg5, %arg6, %arg4] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
-              %1 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
-              %2 = vector.transfer_read %subview_5[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
-              %3 = vector.transfer_read %subview_3[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
-              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
-              vector.transfer_write %4, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
-            }
-          }
-        }
-      }
-    }
-    return %alloc : memref<8x48x32x32xf32>
-  }
-
-
-// -----
-
 // RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.1
 // RUN: tpp-opt %s --hoist-vector-transfer | tpp-run -e entry --entry-point-result=void -print > %t.2
 // RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty
@@ -97,6 +6,7 @@
 #map20 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map21 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map22 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+module {
   memref.global "private" constant @__constant_24x64x64xf32 : memref<24x64x64xf32> = dense<1.000000e+00> {alignment = 64 : i64}
   func.func @entry(%arg0: memref<8x24x32x64xf32>) -> memref<8x24x32x64xf32> {
     %cst = arith.constant 0.000000e+00 : f32
@@ -132,5 +42,4 @@
   }
   return %alloc : memref<8x24x32x64xf32>
   }
-
-
+}

From 6b9663735e64a10d637a2c9be01a69378ab2014a Mon Sep 17 00:00:00 2001
From: athangam
Date: Mon, 11 Nov 2024 01:16:16 -0800
Subject: [PATCH 03/14] Checking Integration Tests for hoisting pass

---
 test/Integration/hoist-vector-transfer-brgemm.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Integration/hoist-vector-transfer-brgemm.mlir b/test/Integration/hoist-vector-transfer-brgemm.mlir
index 301b5e39f..0afcd470b 100644
--- a/test/Integration/hoist-vector-transfer-brgemm.mlir
+++ b/test/Integration/hoist-vector-transfer-brgemm.mlir
@@ -1,5 +1,5 @@
 // RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.1
-// RUN: tpp-opt %s --hoist-vector-transfer | tpp-run -e entry --entry-point-result=void -print > %t.2
+// RUN: tpp-opt %s --hoist-vector-transfer
 // RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty

From 99c458888548e46a7a7fe7f7b4439cc786da459a Mon Sep 17 00:00:00 2001
From: athangam
Date: Mon, 11 Nov 2024 01:27:01 -0800
Subject: [PATCH 04/14] Checking Integration Tests for hoisting pass

---
 test/Integration/hoist-vector-transfer-brgemm.mlir | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/Integration/hoist-vector-transfer-brgemm.mlir b/test/Integration/hoist-vector-transfer-brgemm.mlir
index 0afcd470b..4dda2c8a5 100644
--- a/test/Integration/hoist-vector-transfer-brgemm.mlir
+++ b/test/Integration/hoist-vector-transfer-brgemm.mlir
@@ -1,6 +1,4 @@
 // RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.1
-// RUN: tpp-opt %s --hoist-vector-transfer
-// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty

 // DIFF-NOT: {{.}}

From 69c09bd9e2195374fa19aa1adc5d70e3accdae9c Mon Sep 17 00:00:00 2001
From: athangam
Date: Mon, 11 Nov 2024 01:38:25 -0800
Subject: [PATCH 05/14] Checking Integration Tests for hoisting pass

---
 test/Integration/hoist-vector-transfer-brgemm.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Integration/hoist-vector-transfer-brgemm.mlir b/test/Integration/hoist-vector-transfer-brgemm.mlir
index 4dda2c8a5..88db1dc5f 100644
--- a/test/Integration/hoist-vector-transfer-brgemm.mlir
+++ b/test/Integration/hoist-vector-transfer-brgemm.mlir
@@ -1,4 +1,4 @@
-// RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.1
+// RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print

 // DIFF-NOT: {{.}}

From 3de0bb142c03528902c97bfeb1510e89a81ab1da Mon Sep 17 00:00:00 2001
From: athangam
Date: Mon, 11 Nov 2024 06:19:26 -0800
Subject: [PATCH 06/14] Checking Integration Tests for hoisting pass

---
 test/Integration/hoist-vector-transfer-brgemm.mlir | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/Integration/hoist-vector-transfer-brgemm.mlir b/test/Integration/hoist-vector-transfer-brgemm.mlir
index 88db1dc5f..c79aaf458 100644
--- a/test/Integration/hoist-vector-transfer-brgemm.mlir
+++ b/test/Integration/hoist-vector-transfer-brgemm.mlir
@@ -1,4 +1,6 @@
-// RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print
+// RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.1
+// RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.2
+// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty

 // DIFF-NOT: {{.}}
 #map20 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>

From ae469cab6db9119772e2abc0358e6d32c71c96d7 Mon Sep 17 00:00:00 2001
From: athangam
Date: Mon, 11 Nov 2024 06:25:56 -0800
Subject: [PATCH 07/14] Checking Integration Tests for hoisting pass

---
 .../hoist-vector-transfer-brgemm.mlir         | 24 +++++++------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/test/Integration/hoist-vector-transfer-brgemm.mlir b/test/Integration/hoist-vector-transfer-brgemm.mlir
index c79aaf458..a14e3e7af 100644
--- a/test/Integration/hoist-vector-transfer-brgemm.mlir
+++ b/test/Integration/hoist-vector-transfer-brgemm.mlir
@@ -3,38 +3,30 @@
 // RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty

 // DIFF-NOT: {{.}}
-#map20 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-#map21 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
-#map22 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
-module {
+module {
   memref.global "private" constant @__constant_24x64x64xf32 : memref<24x64x64xf32> = dense<1.000000e+00> {alignment = 64 : i64}
   func.func @entry(%arg0: memref<8x24x32x64xf32>) -> memref<8x24x32x64xf32> {
-    %cst = arith.constant 0.000000e+00 : f32
-    %cst_0 = arith.constant dense<0.000000e+00> : vector<32x64xf32>
     %c1 = arith.constant 1 : index
     %c24 = arith.constant 24 : index
     %c64 = arith.constant 64 : index
     %c4 = arith.constant 4 : index
     %c32 = arith.constant 32 : index
     %c0 = arith.constant 0 : index
+    %cst = arith.constant 0.000000e+00 : f32
     %0 = memref.get_global @__constant_24x64x64xf32 : memref<24x64x64xf32>
     %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x24x32x64xf32>
     scf.forall (%arg1, %arg2) in (8, 24) {
       %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<32x64xf32, strided<[64, 1], offset: ?>>
-      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x64xf32>, memref<32x64xf32, strided<[64, 1], offset: ?>>
+      linalg.fill ins(%cst : f32) outs(%subview : memref<32x64xf32, strided<[64, 1], offset: ?>>)
-      %subview_1 = memref.subview %arg0[%arg1, 0, 0, 0] [1, 24, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>>
+      %subview_0 = memref.subview %arg0[%arg1, 0, 0, 0] [1, 24, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>>
       scf.for %arg3 = %c0 to %c32 step %c4 {
         scf.for %arg4 = %c0 to %c64 step %c64 {
-          %subview_2 = memref.subview %subview[%arg3, %arg4] [4, 64] [1, 1] : memref<32x64xf32, strided<[64, 1], offset: ?>> to memref<4x64xf32, strided<[64, 1], offset: ?>>
+          %subview_1 = memref.subview %subview[%arg3, %arg4] [4, 64] [1, 1] : memref<32x64xf32, strided<[64, 1], offset: ?>> to memref<4x64xf32, strided<[64, 1], offset: ?>>
           scf.for %arg5 = %c0 to %c24 step %c1 {
             scf.for %arg6 = %c0 to %c64 step %c1 {
-              %subview_3 = memref.subview %subview_1[%arg5, %arg3, %arg6] [1, 4, 1] [1, 1, 1] : memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>> to memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>
-              %subview_4 = memref.subview %0[%arg5, %arg6, %arg4] [1, 1, 64] [1, 1, 1] : memref<24x64x64xf32> to memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>
-              %1 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>, vector<1x4x1xf32>
-              %2 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>, vector<1x1x64xf32>
-              %3 = vector.transfer_read %subview_2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x64xf32, strided<[64, 1], offset: ?>>, vector<4x64xf32>
-              %4 = vector.contract {indexing_maps = [#map20, #map21, #map22], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x1xf32>, vector<1x1x64xf32> into vector<4x64xf32>
-              vector.transfer_write %4, %subview_2[%c0, %c0] {in_bounds = [true, true]} : vector<4x64xf32>, memref<4x64xf32, strided<[64, 1], offset: ?>>
+              %subview_2 = memref.subview %subview_0[%arg5, %arg3, %arg6] [1, 4, 1] [1, 1, 1] : memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>> to memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>
+              %subview_3 = memref.subview %0[%arg5, %arg6, %arg4] [1, 1, 64] [1, 1, 1] : memref<24x64x64xf32> to memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>
+              linalg.batch_reduce_matmul ins(%subview_2, %subview_3 : memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>, memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>) outs(%subview_1 : memref<4x64xf32, strided<[64, 1], offset: ?>>)
             }
           }
         }

From b49d72a343939826de54fcbf0fd25183eddab548 Mon Sep 17 00:00:00 2001
From: athangam
Date: Mon, 11 Nov 2024 16:49:14 -0800
Subject: [PATCH 08/14] Checking Integration Tests for hoisting pass

---
 test/Integration/hoist-vector-transfer-brgemm.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Integration/hoist-vector-transfer-brgemm.mlir b/test/Integration/hoist-vector-transfer-brgemm.mlir
index a14e3e7af..cfb9e4dc0 100644
--- a/test/Integration/hoist-vector-transfer-brgemm.mlir
+++ b/test/Integration/hoist-vector-transfer-brgemm.mlir
@@ -1,5 +1,5 @@
 // RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.1
-// RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.2
+// RUN: tpp-opt %s --loop-invariant-code-motion --vectorization-pass --loop-invariant-code-motion --hoist-vector-transfer | tpp-run -e entry --entry-point-result=void -print > %t.2
 // RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty

From 4c37c14bfb996a1b7e8ae7a2e271baa922ef686c Mon Sep 17 00:00:00 2001
From: athangam
Date: Mon, 11 Nov 2024 18:14:27 -0800
Subject: [PATCH 09/14] Added the unit test cases

---
 ...oist-vector-transfer-operation-brgemm.mlir | 277 ++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100644 test/Passes/pass-hoist-vector-transfer-operation-brgemm.mlir

diff --git a/test/Passes/pass-hoist-vector-transfer-operation-brgemm.mlir b/test/Passes/pass-hoist-vector-transfer-operation-brgemm.mlir
new file mode 100644
index 000000000..2b457ac9f
--- /dev/null
+++ b/test/Passes/pass-hoist-vector-transfer-operation-brgemm.mlir
@@ -0,0 +1,277 @@
+// RUN: tpp-opt %s --hoist-vector-transfer --split-input-file | FileCheck %s
+
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+module {
+  memref.global "private" constant @__constant_24x64x64xf32 : memref<24x64x64xf32> = dense<1.000000e+00> {alignment = 64 : i64}
+  func.func @entry(%arg0: memref<8x24x32x64xf32>) -> memref<8x24x32x64xf32> {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<32x64xf32>
+    %c1 = arith.constant 1 : index
+    %c24 = arith.constant 24 : index
+    %c64 = arith.constant 64 : index
+    %c4 = arith.constant 4 : index
+    %c32 = arith.constant 32 : index
+    %c0 = arith.constant 0 : index
+    %0 = memref.get_global @__constant_24x64x64xf32 : memref<24x64x64xf32>
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x24x32x64xf32>
+    scf.forall (%arg1, %arg2) in (8, 24) {
+      %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<32x64xf32, strided<[64, 1], offset: ?>>
+      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x64xf32>, memref<32x64xf32, strided<[64, 1], offset: ?>>
+      %subview_1 = memref.subview %arg0[%arg1, 0, 0, 0] [1, 24, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>>
+      scf.for %arg3 = %c0 to %c32 step %c4 {
+        scf.for %arg4 = %c0 to %c64 step %c64 {
+          %subview_2 = memref.subview %subview[%arg3, %arg4] [4, 64] [1, 1] : memref<32x64xf32, strided<[64, 1], offset: ?>> to memref<4x64xf32, strided<[64, 1], offset: ?>>
+          scf.for %arg5 = %c0 to %c24 step %c1 {
+            scf.for %arg6 = %c0 to %c64 step %c1 {
+              %subview_3 = memref.subview %subview_1[%arg5, %arg3, %arg6] [1, 4, 1] [1, 1, 1] : memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>> to memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>
+              %subview_4 = memref.subview %0[%arg5, %arg6, %arg4] [1, 1, 64] [1, 1, 1] : memref<24x64x64xf32> to memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>
+              %1 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>, vector<1x4x1xf32>
+              %2 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>, vector<1x1x64xf32>
+              %3 = vector.transfer_read %subview_2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x64xf32, strided<[64, 1], offset: ?>>, vector<4x64xf32>
+              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x1xf32>, vector<1x1x64xf32> into vector<4x64xf32>
+              vector.transfer_write %4, %subview_2[%c0, %c0] {in_bounds = [true, true]} : vector<4x64xf32>, memref<4x64xf32, strided<[64, 1], offset: ?>>
+            }
+          }
+        }
+      }
+    }
+    return %alloc : memref<8x24x32x64xf32>
+  }
+}
+
+
+
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+
+// CHECK-LABEL: memref.global "private" constant @__constant_24x64x64xf32 : memref<24x64x64xf32> = dense<1.000000e+00> {alignment = 64 : i64}
+
+// CHECK-LABEL: func.func @entry(
+// CHECK-SAME: %[[VAL_0:.*]]: memref<8x24x32x64xf32>) -> memref<8x24x32x64xf32> {
+// CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_2:.*]] = arith.constant dense<0.000000e+00> : vector<32x64xf32>
+// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 24 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 64 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant 4 : index
+// CHECK: %[[VAL_7:.*]] = arith.constant 32 : index
+// CHECK: %[[VAL_8:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_9:.*]] = memref.get_global @__constant_24x64x64xf32 : memref<24x64x64xf32>
+// CHECK: %[[VAL_10:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x24x32x64xf32>
+// CHECK: scf.forall (%[[VAL_11:.*]], %[[VAL_12:.*]]) in (8, 24) {
+// CHECK: %[[VAL_13:.*]] = memref.subview %[[VAL_10]]{{\[}}%[[VAL_11]], %[[VAL_12]], 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<32x64xf32, strided<[64, 1], offset: ?>>
+// CHECK: vector.transfer_write %[[VAL_2]], %[[VAL_13]]{{\[}}%[[VAL_8]], %[[VAL_8]]] {in_bounds = [true, true]} : vector<32x64xf32>, memref<32x64xf32, strided<[64, 1], offset: ?>>
+// CHECK: %[[VAL_14:.*]] = memref.subview %[[VAL_0]]{{\[}}%[[VAL_11]], 0, 0, 0] [1, 24, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>>
+// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_6]] {
+// CHECK: scf.for %[[VAL_16:.*]] = %[[VAL_8]] to %[[VAL_5]] step %[[VAL_5]] {
+// CHECK: %[[VAL_17:.*]] = memref.subview %[[VAL_13]]{{\[}}%[[VAL_15]], %[[VAL_16]]] [4, 64] [1, 1] : memref<32x64xf32, strided<[64, 1], offset: ?>> to memref<4x64xf32, strided<[64, 1], offset: ?>>
+// CHECK: %[[VAL_18:.*]] = vector.transfer_read %[[VAL_17]]{{\[}}%[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true]} : memref<4x64xf32, strided<[64, 1], offset: ?>>, vector<4x64xf32>
+// CHECK: %[[VAL_19:.*]] = scf.for %[[VAL_20:.*]] = %[[VAL_8]] to %[[VAL_4]] step %[[VAL_3]] iter_args(%[[VAL_21:.*]] = %[[VAL_18]]) -> (vector<4x64xf32>) {
+// CHECK: %[[VAL_22:.*]] = scf.for %[[VAL_23:.*]] = %[[VAL_8]] to %[[VAL_5]] step %[[VAL_3]] iter_args(%[[VAL_24:.*]] = %[[VAL_21]]) -> (vector<4x64xf32>) {
+// CHECK: %[[VAL_25:.*]] = memref.subview %[[VAL_14]]{{\[}}%[[VAL_20]], %[[VAL_15]], %[[VAL_23]]] [1, 4, 1] [1, 1, 1] : memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>> to memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>
+// CHECK: %[[VAL_26:.*]] = memref.subview %[[VAL_9]]{{\[}}%[[VAL_20]], %[[VAL_23]], %[[VAL_16]]] [1, 1, 64] [1, 1, 1] : memref<24x64x64xf32> to memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>
+// CHECK: %[[VAL_27:.*]] = vector.transfer_read %[[VAL_25]]{{\[}}%[[VAL_8]], %[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<1x4x1xf32, strided<[2048, 64, 1], offset: ?>>, vector<1x4x1xf32>
+// CHECK: %[[VAL_28:.*]] = vector.transfer_read %[[VAL_26]]{{\[}}%[[VAL_8]], %[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<1x1x64xf32, strided<[4096, 64, 1], offset: ?>>, vector<1x1x64xf32>
+// CHECK: %[[VAL_29:.*]] = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %arg8 : vector<1x4x1xf32>, vector<1x1x64xf32> into vector<4x64xf32>
+// CHECK: scf.yield %[[VAL_29]] : vector<4x64xf32>
+// CHECK: }
+// CHECK: scf.yield %[[VAL_22]] : vector<4x64xf32>
+// CHECK: }
+// CHECK: vector.transfer_write %[[VAL_19]], %[[VAL_17]]{{\[}}%[[VAL_8]], %[[VAL_8]]] {in_bounds = [true, true]} : vector<4x64xf32>, memref<4x64xf32, strided<[64, 1], offset: ?>>
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: return %[[VAL_10]] : memref<8x24x32x64xf32>
+// CHECK: }
+
+
+
+
+// -----
+
+// RUN: tpp-opt %s --hoist-vector-transfer --split-input-file | FileCheck %s
+
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+module {
+  memref.global "private" constant @__constant_48x32x32xf32 : memref<48x32x32xf32> = dense<1.000000e+00> {alignment = 64 : i64}
+  func.func @entry(%arg0: memref<8x48x32x32xf32>) -> memref<8x48x32x32xf32> {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<32x32xf32>
+    %c1 = arith.constant 1 : index
+    %c48 = arith.constant 48 : index
+    %c2 = arith.constant 2 : index
+    %c4 = arith.constant 4 : index
+    %c32 = arith.constant 32 : index
+    %c0 = arith.constant 0 : index
+    %0 = memref.get_global @__constant_48x32x32xf32 : memref<48x32x32xf32>
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x48x32x32xf32>
+    scf.forall (%arg1, %arg2) in (8, 48) {
+      %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
+      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
+      %subview_2 = memref.subview %arg0[%arg1, 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
+      scf.for %arg3 = %c0 to %c32 step %c4 {
+        scf.for %arg4 = %c0 to %c32 step %c2 {
+          %subview_3 = memref.subview %subview[%arg3, %arg4] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
+          scf.for %arg5 = %c0 to %c48 step %c1 {
+            scf.for %arg6 = %c0 to %c32 step %c4 {
+              %subview_4 = memref.subview %subview_2[%arg5, %arg3, %arg6] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
+              %subview_5 = memref.subview %0[%arg5, %arg6, %arg4] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
+              %1 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
+              %2 = vector.transfer_read %subview_5[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
+              %3 = vector.transfer_read %subview_3[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
+              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
+              vector.transfer_write %4, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
+            }
+          }
+        }
+      }
+    }
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<8x48x32x32xf32>
+    scf.forall (%arg1, %arg2) in (8, 48) {
+      %subview = memref.subview %alloc_1[%arg1, %arg2, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
+      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
+      %subview_2 = memref.subview %alloc[%arg1, 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
+      scf.for %arg3 = %c0 to %c32 step %c4 {
+        scf.for %arg4 = %c0 to %c32 step %c2 {
+          %subview_3 = memref.subview %subview[%arg3, %arg4] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
+          scf.for %arg5 = %c0 to %c48 step %c1 {
+            scf.for %arg6 = %c0 to %c32 step %c4 {
+              %subview_4 = memref.subview %subview_2[%arg5, %arg3, %arg6] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
+              %subview_5 = memref.subview %0[%arg5, %arg6, %arg4] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
+              %1 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
+              %2 = vector.transfer_read %subview_5[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
+              %3 = vector.transfer_read %subview_3[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
+              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
+              vector.transfer_write %4, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
+            }
+          }
+        }
+      }
+    }
+    scf.forall (%arg1, %arg2) in (8, 48) {
+      %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
+      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
+      %subview_2 = memref.subview %alloc_1[%arg1, 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
+      scf.for %arg3 = %c0 to %c32 step %c4 {
+        scf.for %arg4 = %c0 to %c32 step %c2 {
+          %subview_3 = memref.subview %subview[%arg3, %arg4] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
+          scf.for %arg5 = %c0 to %c48 step %c1 {
+            scf.for %arg6 = %c0 to %c32 step %c4 {
+              %subview_4 = memref.subview %subview_2[%arg5, %arg3, %arg6] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
+              %subview_5 = memref.subview %0[%arg5, %arg6, %arg4] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
+              %1 = vector.transfer_read %subview_4[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
+              %2 = vector.transfer_read %subview_5[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
+              %3 = vector.transfer_read %subview_3[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
+              %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
+              vector.transfer_write %4, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
+            }
+          }
+        }
+      }
+    }
+    return %alloc : memref<8x48x32x32xf32>
+  }
+}
+
+
+
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+
+
+// CHECK-LABEL: memref.global "private" constant @__constant_48x32x32xf32 : memref<48x32x32xf32> = dense<1.000000e+00> {alignment = 64 : i64}
+
+// CHECK-LABEL: func.func @entry(
+// CHECK-SAME: %[[VAL_0:.*]]: memref<8x48x32x32xf32>) -> memref<8x48x32x32xf32> {
+// CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_2:.*]] = arith.constant dense<0.000000e+00> : vector<32x32xf32>
+// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 48 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 2 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant 4 : index
+// CHECK: %[[VAL_7:.*]] = arith.constant 32 : index
+// CHECK: %[[VAL_8:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_9:.*]] = memref.get_global @__constant_48x32x32xf32 : memref<48x32x32xf32>
+// CHECK: %[[VAL_10:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x48x32x32xf32>
+// CHECK: scf.forall (%[[VAL_11:.*]], %[[VAL_12:.*]]) in (8, 48) {
+// CHECK: %[[VAL_13:.*]] = memref.subview %[[VAL_10]]{{\[}}%[[VAL_11]], %[[VAL_12]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
+// CHECK: vector.transfer_write %[[VAL_2]], %[[VAL_13]]{{\[}}%[[VAL_8]], %[[VAL_8]]] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
+// CHECK: %[[VAL_14:.*]] = memref.subview %[[VAL_0]]{{\[}}%[[VAL_11]], 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
+// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_6]] {
+// CHECK: scf.for %[[VAL_16:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_5]] {
+// CHECK: %[[VAL_17:.*]] = memref.subview %[[VAL_13]]{{\[}}%[[VAL_15]], %[[VAL_16]]] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
+// CHECK: %[[VAL_18:.*]] = vector.transfer_read %[[VAL_17]]{{\[}}%[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
+// CHECK: %[[VAL_19:.*]] = scf.for %[[VAL_20:.*]] = %[[VAL_8]] to %[[VAL_4]] step %[[VAL_3]] iter_args(%[[VAL_21:.*]] = %[[VAL_18]]) -> (vector<4x2xf32>) {
+// CHECK: %[[VAL_22:.*]] = scf.for %[[VAL_23:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_6]] iter_args(%[[VAL_24:.*]] = %[[VAL_21]]) -> (vector<4x2xf32>) {
+// CHECK: %[[VAL_25:.*]] = memref.subview %[[VAL_14]]{{\[}}%[[VAL_20]], %[[VAL_15]], %[[VAL_23]]] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
+// CHECK: %[[VAL_26:.*]] = memref.subview %[[VAL_9]]{{\[}}%[[VAL_20]], %[[VAL_23]], %[[VAL_16]]] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
+// CHECK: %[[VAL_27:.*]] = vector.transfer_read %[[VAL_25]]{{\[}}%[[VAL_8]], %[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
+// CHECK: %[[VAL_28:.*]] = vector.transfer_read %[[VAL_26]]{{\[}}%[[VAL_8]], %[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
+// CHECK: %[[VAL_29:.*]] = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %arg8 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
+// CHECK: scf.yield %[[VAL_29]] : vector<4x2xf32>
+// CHECK: }
+// CHECK: scf.yield %[[VAL_22]] : vector<4x2xf32>
+// CHECK: }
+// CHECK: vector.transfer_write %[[VAL_19]], %[[VAL_17]]{{\[}}%[[VAL_8]], %[[VAL_8]]] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_30:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x48x32x32xf32>
+// CHECK: scf.forall (%[[VAL_31:.*]], %[[VAL_32:.*]]) in (8, 48) {
+// CHECK: %[[VAL_33:.*]] = memref.subview %[[VAL_30]]{{\[}}%[[VAL_31]], %[[VAL_32]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
+// CHECK: vector.transfer_write %[[VAL_2]], %[[VAL_33]]{{\[}}%[[VAL_8]], %[[VAL_8]]] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
+// CHECK: %[[VAL_34:.*]] = memref.subview %[[VAL_10]]{{\[}}%[[VAL_31]], 0, 0, 0] [1, 48, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>>
+// CHECK: scf.for %[[VAL_35:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_6]] {
+// CHECK: scf.for %[[VAL_36:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_5]] {
+// CHECK: %[[VAL_37:.*]] = memref.subview %[[VAL_33]]{{\[}}%[[VAL_35]], %[[VAL_36]]] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
+// CHECK: %[[VAL_38:.*]] = vector.transfer_read %[[VAL_37]]{{\[}}%[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
+// CHECK: %[[VAL_39:.*]] = scf.for %[[VAL_40:.*]] = %[[VAL_8]] to %[[VAL_4]] step %[[VAL_3]] iter_args(%[[VAL_41:.*]] = %[[VAL_38]]) -> (vector<4x2xf32>) {
+// CHECK: %[[VAL_42:.*]] = scf.for %[[VAL_43:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_6]] iter_args(%[[VAL_44:.*]] = %[[VAL_41]]) -> (vector<4x2xf32>) {
+// CHECK: %[[VAL_45:.*]] = memref.subview %[[VAL_34]]{{\[}}%[[VAL_40]], %[[VAL_35]], %[[VAL_43]]] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
+// CHECK: %[[VAL_46:.*]] = memref.subview %[[VAL_9]]{{\[}}%[[VAL_40]], %[[VAL_43]], %[[VAL_36]]] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
+// CHECK: %[[VAL_47:.*]] = vector.transfer_read %[[VAL_45]]{{\[}}%[[VAL_8]], %[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
+// CHECK: %[[VAL_48:.*]] = vector.transfer_read %[[VAL_46]]{{\[}}%[[VAL_8]], %[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
+// CHECK: %[[VAL_49:.*]] = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %arg8 : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
+// CHECK: scf.yield %[[VAL_49]] : vector<4x2xf32>
+// CHECK: }
+// CHECK: scf.yield %[[VAL_42]] : vector<4x2xf32>
+// CHECK: }
+// CHECK: vector.transfer_write %[[VAL_39]], %[[VAL_37]]{{\[}}%[[VAL_8]], %[[VAL_8]]] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: scf.forall (%[[VAL_50:.*]], %[[VAL_51:.*]]) in (8, 48) {
+// CHECK: %[[VAL_52:.*]] = memref.subview %[[VAL_10]]{{\[}}%[[VAL_50]], %[[VAL_51]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<8x48x32x32xf32> to memref<32x32xf32, strided<[32, 1], offset: ?>>
+// CHECK: vector.transfer_write %[[VAL_2]], %[[VAL_52]]{{\[}}%[[VAL_8]], %[[VAL_8]]] {in_bounds = [true, true]} : vector<32x32xf32>, memref<32x32xf32, strided<[32, 1], offset: ?>>
+// CHECK: scf.for %[[VAL_54:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_6]] {
+// CHECK: scf.for %[[VAL_55:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_5]] {
+// CHECK: %[[VAL_56:.*]] = memref.subview %[[VAL_52]]{{\[}}%[[VAL_54]], %[[VAL_55]]] [4, 2] [1, 1] : memref<32x32xf32, strided<[32, 1], offset: ?>> to memref<4x2xf32, strided<[32, 1], offset: ?>>
+// CHECK: %[[VAL_57:.*]] = vector.transfer_read %[[VAL_56]]{{\[}}%[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true]} : memref<4x2xf32, strided<[32, 1], offset: ?>>, vector<4x2xf32>
+// CHECK: %[[VAL_58:.*]] = scf.for %[[VAL_59:.*]] = %[[VAL_8]] to %[[VAL_4]] step %[[VAL_3]] iter_args(%[[VAL_60:.*]] = %[[VAL_57]]) -> (vector<4x2xf32>) {
+// CHECK: %[[VAL_61:.*]] = scf.for %[[VAL_62:.*]] = %[[VAL_8]] to %[[VAL_7]] step %[[VAL_6]] iter_args(%[[VAL_63:.*]] = %[[VAL_60]]) -> (vector<4x2xf32>) {
+// CHECK: %[[VAL_64:.*]] = memref.subview %[[VAL_53]]{{\[}}%[[VAL_59]], %[[VAL_54]], %[[VAL_62]]] [1, 4, 4] [1, 1, 1] : memref<48x32x32xf32, strided<[1024, 32, 1], offset: ?>> to memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>
+// CHECK: %[[VAL_65:.*]] = memref.subview %[[VAL_9]]{{\[}}%[[VAL_59]], %[[VAL_62]], %[[VAL_55]]] [1, 4, 2] [1, 1, 1] : memref<48x32x32xf32> to memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>
+// CHECK: %[[VAL_66:.*]] = vector.transfer_read %[[VAL_64]]{{\[}}%[[VAL_8]], %[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<1x4x4xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x4xf32>
+// CHECK: %[[VAL_67:.*]] = vector.transfer_read %[[VAL_65]]{{\[}}%[[VAL_8]], %[[VAL_8]], %[[VAL_8]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<1x4x2xf32, strided<[1024, 32, 1], offset: ?>>, vector<1x4x2xf32>
+// CHECK: %[[VAL_68:.*]] = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %[[VAL_66]], %[[VAL_67]], %[[VAL_63]] : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
+// CHECK: scf.yield %[[VAL_68]] : vector<4x2xf32>
+// CHECK: }
+// CHECK: scf.yield %[[VAL_61]] : vector<4x2xf32>
+// CHECK: }
+// CHECK: vector.transfer_write %[[VAL_58]], %[[VAL_56]]{{\[}}%[[VAL_8]], %[[VAL_8]]] {in_bounds = [true, true]} : vector<4x2xf32>, memref<4x2xf32, strided<[32, 1], offset: ?>>
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: return %[[VAL_10]] : memref<8x48x32x32xf32>
+// CHECK: }

From 1fb6d33405ade020d474b3731754cc2aae99af99 Mon Sep 17 00:00:00 2001
From: athangam
Date: Mon, 11 Nov 2024 20:29:36 -0800
Subject: [PATCH 10/14] Some cleanups to the file(s)

---
 include/TPP/Passes.td                       |  2 +-
 lib/TPP/Transforms/HoistVectorTransfers.cpp | 23 +++++++++++----------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/TPP/Passes.td b/include/TPP/Passes.td
index 586828a90..67c5f675f 100644
--- a/include/TPP/Passes.td
+++ b/include/TPP/Passes.td
@@ -58,7 +58,7 @@ def VectorizationPass : Pass<"vectorization-pass",
 def HoistVectorTransfers : Pass<"hoist-vector-transfer"> {
   let summary = "Hoist vector transfer operation outside of reduction and k loop";
   let description = [{
-    Hoists the vector transfer read and write operations of 'C' matrix outside the reduction and k loop for an brgemm operation. This pass should be applied after the BrgemmLinalgTiling Pass.
+    Hoists the vector transfer read and write operations of the resultant matrix outside the reduction and k loop for a brgemm operation. This pass should be applied after the BrgemmLinalgTiling Pass.
   }];
   let dependentDialects = [ "vector::VectorDialect" ];
 }

diff --git a/lib/TPP/Transforms/HoistVectorTransfers.cpp b/lib/TPP/Transforms/HoistVectorTransfers.cpp
index 7d55cae3e..08fb1cf4a 100644
--- a/lib/TPP/Transforms/HoistVectorTransfers.cpp
+++ b/lib/TPP/Transforms/HoistVectorTransfers.cpp
@@ -1,3 +1,4 @@
+//===- HoistVectorTransfers.cpp -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -41,35 +42,35 @@ struct HoistVectorTransferOp : OpRewritePattern<vector::ContractionOp> {
   LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
                                 PatternRewriter &rewriter) const override {
-    //llvm::outs() << "The defining operation is: Arun" << "\n";
+
     // Code to hoist vector transfer read before the reduction and k loop
-    auto vectorReadOp = contractOp.getOperand(contractOp.getNumOperands()-1).getDefiningOp();
-    if (vectorReadOp) {
+    if (auto vectorReadOp = contractOp.getOperand(contractOp.getNumOperands()-1).getDefiningOp()) {
      auto subviewOp = vectorReadOp->getOperand(0).getDefiningOp();
      rewriter.setInsertionPointAfter(subviewOp);
+
      auto retriveVectorReadOp = llvm::dyn_cast<vector::TransferReadOp>(vectorReadOp);
      auto *cloneVectorReadOp = rewriter.clone(*retriveVectorReadOp);
-     contractOp.setOperand(contractOp.getNumOperands()-1, (*cloneVectorReadOp).getResult(0));
      retriveVectorReadOp.replaceAllUsesWith(cloneVectorReadOp);
 
-     // Code to re-create the reduction and k loop with iter args to
+     // Code to re-create the reduction and k loop with iter args
      auto *nextOp = (*cloneVectorReadOp).getNextNode();
-     if (nextOp) {
+     if (auto oldReductionForOp = llvm::dyn_cast<scf::ForOp>(*nextOp)) {
+     if (auto oldKForOp = llvm::dyn_cast<scf::ForOp>(oldReductionForOp.getBody()->front())) {
       auto vectorReadOpValue = (*cloneVectorReadOp).getResult(0);
-      auto oldReductionForOp = llvm::dyn_cast<scf::ForOp>(*nextOp);
-      auto oldKForOp = llvm::dyn_cast<scf::ForOp>(oldReductionForOp.getBody()->front());
      rewriter.setInsertionPoint(oldReductionForOp);
+
      auto newReductionForOp = rewriter.create<scf::ForOp>(
@@ -118,14 +119,14 @@ struct HoistVectorTransferOp : OpRewritePattern<vector::ContractionOp> {
        vectorWriteOperation->moveBefore(oldReductionForOp);
      }
 
-     // Erase the vector contract operation
+     // Erase the old vector contract operation
      for (auto result : contractOp->getResults()) {
        for (auto *userOp : result.getUsers()) {
          userOp->erase();
        }
      }
      contractOp.erase();
-
+     }
      }
    }
    return success();

From 8e0aa2800233c34577162b79f5f58611337e534b Mon Sep 17 00:00:00 2001
From: athangam
Date: Tue, 12 Nov 2024 18:38:28 -0800
Subject: [PATCH 11/14] Some cleanups to the file(s)

---
 include/TPP/Passes.td                       | 2 +-
 lib/TPP/Transforms/HoistVectorTransfers.cpp | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)
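
Note on the dependent-dialects change below: the rewrite pattern materializes new
scf.for and scf.yield operations, and a pass may only create operations from
dialects it has declared, since that declaration is what loads the dialects into
the MLIRContext before the pass runs. The TableGen entry expands to roughly the
following C++ hook (a sketch for illustration, not the literal generated code):

  void getDependentDialects(DialectRegistry &registry) const override {
    // Declares every dialect whose ops this pass may create.
    registry.insert<vector::VectorDialect, scf::SCFDialect>();
  }
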
diff --git a/include/TPP/Passes.td b/include/TPP/Passes.td
index 67c5f675f..36d74939f 100644
--- a/include/TPP/Passes.td
+++ b/include/TPP/Passes.td
@@ -60,7 +60,7 @@ def HoistVectorTransfers : Pass<"hoist-vector-transfer"> {
   let description = [{
     Hoists the vector transfer read and write operations of the resultant matrix outside the reduction and k loop for a brgemm operation. This pass should be applied after the BrgemmLinalgTiling Pass.
   }];
-  let dependentDialects = [ "vector::VectorDialect" ];
+  let dependentDialects = [ "vector::VectorDialect", "scf::SCFDialect" ];
 }

diff --git a/lib/TPP/Transforms/HoistVectorTransfers.cpp b/lib/TPP/Transforms/HoistVectorTransfers.cpp
index 08fb1cf4a..5ae1d695b 100644
--- a/lib/TPP/Transforms/HoistVectorTransfers.cpp
+++ b/lib/TPP/Transforms/HoistVectorTransfers.cpp
@@ -37,15 +37,12 @@ namespace tpp {
 struct HoistVectorTransferOp : OpRewritePattern<vector::ContractionOp> {
   using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
 
-  HoistVectorTransferOp(MLIRContext *ctx)
-      : OpRewritePattern<vector::ContractionOp>(ctx) {}
-
   LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
                                 PatternRewriter &rewriter) const override {
 
     // Code to hoist vector transfer read before the reduction and k loop
     if (auto vectorReadOp = contractOp.getOperand(contractOp.getNumOperands()-1).getDefiningOp()) {
-     auto subviewOp = vectorReadOp->getOperand(0).getDefiningOp();
+     auto subviewOp = vectorReadOp->getOperand(0).getDefiningOp<memref::SubViewOp>();
      rewriter.setInsertionPointAfter(subviewOp);
 
      auto retriveVectorReadOp = llvm::dyn_cast<vector::TransferReadOp>(vectorReadOp);

From 077792b92f0e9982c66798c8c0054c3b6f1c482e Mon Sep 17 00:00:00 2001
From: athangam
Date: Wed, 13 Nov 2024 02:11:16 -0800
Subject: [PATCH 12/14] Code refactoring

---
 lib/TPP/Transforms/HoistVectorTransfers.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/TPP/Transforms/HoistVectorTransfers.cpp b/lib/TPP/Transforms/HoistVectorTransfers.cpp
index 5ae1d695b..b9cf23755 100644
--- a/lib/TPP/Transforms/HoistVectorTransfers.cpp
+++ b/lib/TPP/Transforms/HoistVectorTransfers.cpp
@@ -41,11 +41,9 @@ struct HoistVectorTransferOp : OpRewritePattern<vector::ContractionOp> {
                                 PatternRewriter &rewriter) const override {
 
     // Code to hoist vector transfer read before the reduction and k loop
-    if (auto vectorReadOp = contractOp.getOperand(contractOp.getNumOperands()-1).getDefiningOp()) {
-     auto subviewOp = vectorReadOp->getOperand(0).getDefiningOp<memref::SubViewOp>();
+    if (auto retriveVectorReadOp = contractOp.getAcc().getDefiningOp<vector::TransferReadOp>()) {
+     auto subviewOp = retriveVectorReadOp.getOperand(0).getDefiningOp<memref::SubViewOp>();
      rewriter.setInsertionPointAfter(subviewOp);
-
-     auto retriveVectorReadOp = llvm::dyn_cast<vector::TransferReadOp>(vectorReadOp);
      auto *cloneVectorReadOp = rewriter.clone(*retriveVectorReadOp);

From a20bfe653b1b22c8f4da1090fa31c35ce252d5e1 Mon Sep 17 00:00:00 2001
From: Arun Thangamani
Date: Wed, 13 Nov 2024 22:22:21 -0800
Subject: [PATCH 13/14] Code refactoring and addition of a few negative test checks

---
 lib/TPP/Transforms/HoistVectorTransfers.cpp   | 182 ++++++++++--------
 .../hoist-vector-transfer-brgemm.mlir         |  28 ++-
 ...oist-vector-transfer-operation-brgemm.mlir |  99 ++++++++++
 3 files changed, 222 insertions(+), 87 deletions(-)

diff --git a/lib/TPP/Transforms/HoistVectorTransfers.cpp b/lib/TPP/Transforms/HoistVectorTransfers.cpp
index b9cf23755..941d49b63 100644
--- a/lib/TPP/Transforms/HoistVectorTransfers.cpp
+++ b/lib/TPP/Transforms/HoistVectorTransfers.cpp
@@ -39,92 +39,106 @@ struct HoistVectorTransferOp : OpRewritePattern<vector::ContractionOp> {
   LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
                                 PatternRewriter &rewriter) const override {
+
+    // Check whether the linalg tiling + vector contract pattern matches
+    auto retriveVectorReadOp = contractOp.getAcc().getDefiningOp<vector::TransferReadOp>();
+    if (retriveVectorReadOp == NULL)
+      return rewriter.notifyMatchFailure(contractOp, "Not a linalg tile + vector contract operation");
+
+    auto subviewOp = retriveVectorReadOp.getOperand(0).getDefiningOp<memref::SubViewOp>();
+    if (subviewOp == NULL)
+      return rewriter.notifyMatchFailure(contractOp, "Not a linalg tile + vector contract operation");
+
+    auto ReductionForOp = llvm::dyn_cast<scf::ForOp>(subviewOp->getNextNode());
+    if (ReductionForOp == NULL)
+      return rewriter.notifyMatchFailure(contractOp, "Not a linalg tile + vector contract operation");
+
+    auto KForOp = llvm::dyn_cast<scf::ForOp>(ReductionForOp.getBody()->front());
+    if (KForOp == NULL)
+      return rewriter.notifyMatchFailure(contractOp, "Not a linalg tile + vector contract operation");
+
+    // Move the vector transfer read before the resuction and k loop
+    rewriter.setInsertionPointAfter(subviewOp);
+    auto *cloneVectorReadOp = rewriter.clone(*retriveVectorReadOp);
+    retriveVectorReadOp.replaceAllUsesWith(cloneVectorReadOp);
+
+    // Code to re-create the reduction and k loop with iter args
+    auto *nextOp = (*cloneVectorReadOp).getNextNode();
+    auto oldReductionForOp = llvm::dyn_cast<scf::ForOp>(*nextOp);
+    auto oldKForOp = llvm::dyn_cast<scf::ForOp>(oldReductionForOp.getBody()->front());
+
+    auto vectorReadOpValue = (*cloneVectorReadOp).getResult(0);
+    rewriter.setInsertionPoint(oldReductionForOp);
+
+    auto newReductionForOp = rewriter.create<scf::ForOp>(
+        oldReductionForOp.getLoc(), oldReductionForOp.getLowerBound(), oldReductionForOp.getUpperBound(),
+        oldReductionForOp.getStep(),ValueRange{vectorReadOpValue},
+        [&](OpBuilder &rewriterNewReductionForOp, Location locNewReductionForOp, Value ivNewReductionForOp,
+            ValueRange iterArgsNewReductionForOp) {
+
+          auto newKForOp = rewriter.create<scf::ForOp>(
+              oldKForOp.getLoc(), oldKForOp.getLowerBound(), oldKForOp.getUpperBound(),
+              oldKForOp.getStep(), iterArgsNewReductionForOp,
+              [&](OpBuilder &rewriterNewKForOp, Location locNewKForOp, Value ivNewKForOp,
+                  ValueRange iterArgsNewKForOp) {
+
+                mlir::IRMapping mapper;
+                mapper.map(oldReductionForOp.getInductionVar(), ivNewReductionForOp);
+                mapper.map(oldKForOp.getInductionVar(), ivNewKForOp);
+
+                for (auto [origArgReduction, newArgReduction] :
+                     llvm::zip(oldReductionForOp.getRegionIterArgs(), iterArgsNewReductionForOp)) {
+                  mapper.map(origArgReduction, newArgReduction);
+                }
+
+                for (auto [origArgK, newArgK] :
+                     llvm::zip(oldKForOp.getRegionIterArgs(), iterArgsNewKForOp)) {
+                  mapper.map(origArgK, newArgK);
+                }
+
+                for (auto &op : oldKForOp.getBody()->without_terminator()) {
+                  rewriterNewKForOp.clone(op, mapper);
+                }
+
+                rewriterNewKForOp.create<scf::YieldOp>(locNewKForOp, iterArgsNewKForOp);
+
+              });
+          rewriterNewReductionForOp.create<scf::YieldOp>(locNewReductionForOp, newKForOp.getResult(0));
+        });
+
+    //Code to hoist vector transfer write after reduction loop and also to update the yield of k loop
+    auto newKForOp = llvm::dyn_cast<scf::ForOp>(newReductionForOp.getBody()->front());
+    Value newcontractOpValue;
+    mlir::vector::TransferWriteOp vectorWriteOperation;
+    mlir::Block *bodyBlock = newKForOp.getBody();
+    for (auto &op : bodyBlock->getOperations()) {
+      if (auto vectorContractOp = llvm::dyn_cast<vector::ContractionOp>(op)) {
+        vectorContractOp.setOperand(vectorContractOp.getNumOperands()-1, newKForOp.getRegionIterArgs()[0]);
+        newcontractOpValue = vectorContractOp.getResult();
+      }
+      if (auto yieldOp = llvm::dyn_cast<scf::YieldOp>(op)) {
+        if ( newcontractOpValue != NULL)
+          yieldOp.setOperand(0, newcontractOpValue);
+      }
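+      // Remember the accumulator's transfer_write; it is re-wired below to
+      // consume the final loop result and is then hoisted out of the loops.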
+      if (auto vectorWriteOp = llvm::dyn_cast<vector::TransferWriteOp>(op)) {
+        vectorWriteOperation = vectorWriteOp;
+      }
+    }
+
+    if (vectorWriteOperation != NULL) {
+      vectorWriteOperation.setOperand(0,newReductionForOp.getResult(0));
+      vectorWriteOperation->moveBefore(oldReductionForOp);
+    }
-
-    // Code to hoist vector transfer read before the reduction and k loop
-    if (auto retriveVectorReadOp = contractOp.getAcc().getDefiningOp<vector::TransferReadOp>()) {
-     auto subviewOp = retriveVectorReadOp.getOperand(0).getDefiningOp<memref::SubViewOp>();
-     rewriter.setInsertionPointAfter(subviewOp);
-     auto *cloneVectorReadOp = rewriter.clone(*retriveVectorReadOp);
-     retriveVectorReadOp.replaceAllUsesWith(cloneVectorReadOp);
-
-     // Code to re-create the reduction and k loop with iter args
-     auto *nextOp = (*cloneVectorReadOp).getNextNode();
-     if (auto oldReductionForOp = llvm::dyn_cast<scf::ForOp>(*nextOp)) {
-     if (auto oldKForOp = llvm::dyn_cast<scf::ForOp>(oldReductionForOp.getBody()->front())) {
-      auto vectorReadOpValue = (*cloneVectorReadOp).getResult(0);
-      rewriter.setInsertionPoint(oldReductionForOp);
-
-      auto newReductionForOp = rewriter.create<scf::ForOp>(
-          oldReductionForOp.getLoc(), oldReductionForOp.getLowerBound(), oldReductionForOp.getUpperBound(),
-          oldReductionForOp.getStep(),ValueRange{vectorReadOpValue},
-          [&](OpBuilder &rewriterNewReductionForOp, Location locNewReductionForOp, Value ivNewReductionForOp,
-              ValueRange iterArgsNewReductionForOp) {
-
-            auto newKForOp = rewriter.create<scf::ForOp>(
-                oldKForOp.getLoc(), oldKForOp.getLowerBound(), oldKForOp.getUpperBound(),
-                oldKForOp.getStep(), iterArgsNewReductionForOp,
-                [&](OpBuilder &rewriterNewKForOp, Location locNewKForOp, Value ivNewKForOp,
-                    ValueRange iterArgsNewKForOp) {
-
-                  mlir::IRMapping mapper;
-                  mapper.map(oldReductionForOp.getInductionVar(), ivNewReductionForOp);
-                  mapper.map(oldKForOp.getInductionVar(), ivNewKForOp);
-
-                  for (auto [origArgReduction, newArgReduction] :
-                       llvm::zip(oldReductionForOp.getRegionIterArgs(), iterArgsNewReductionForOp)) {
-                    mapper.map(origArgReduction, newArgReduction);
-                  }
-
-                  for (auto [origArgK, newArgK] :
-                       llvm::zip(oldKForOp.getRegionIterArgs(), iterArgsNewKForOp)) {
-                    mapper.map(origArgK, newArgK);
-                  }
-
-                  for (auto &op : oldKForOp.getBody()->without_terminator()) {
-                    rewriterNewKForOp.clone(op, mapper);
-                  }
-
-                  rewriterNewKForOp.create<scf::YieldOp>(locNewKForOp, iterArgsNewKForOp);
-
-                });
-            rewriterNewReductionForOp.create<scf::YieldOp>(locNewReductionForOp, newKForOp.getResult(0));
-          });
-
-      //Code to hoist vector transfer write after reduction loop and also to update the yield of k loop
-      auto newKForOp = llvm::dyn_cast<scf::ForOp>(newReductionForOp.getBody()->front());
-      Value newcontractOpValue;
-      mlir::vector::TransferWriteOp vectorWriteOperation;
-      mlir::Block *bodyBlock = newKForOp.getBody();
-      for (auto &op : bodyBlock->getOperations()) {
-        if (auto vectorContractOp = llvm::dyn_cast<vector::ContractionOp>(op)) {
-          vectorContractOp.setOperand(vectorContractOp.getNumOperands()-1, newKForOp.getRegionIterArgs()[0]);
-          newcontractOpValue = vectorContractOp.getResult();
-        }
-        if (auto yieldOp = llvm::dyn_cast<scf::YieldOp>(op)) {
-          if ( newcontractOpValue != NULL)
-            yieldOp.setOperand(0, newcontractOpValue);
-        }
-        if (auto vectorWriteOp = llvm::dyn_cast<vector::TransferWriteOp>(op)) {
-          vectorWriteOperation = vectorWriteOp;
-        }
-      }
-
-      if (vectorWriteOperation != NULL) {
-        vectorWriteOperation.setOperand(0,newReductionForOp.getResult(0));
-        vectorWriteOperation->moveBefore(oldReductionForOp);
-      }
-
-      // Erase the old vector contract operation
-      for (auto result : contractOp->getResults()) {
-        for (auto *userOp : result.getUsers()) {
-          userOp->erase();
-        }
-      }
-      contractOp.erase();
-     }
-     }
-    }
-    return success();
+
+    // Erase the old vector contract operation
+    for (auto result : contractOp->getResults()) {
+      for (auto *userOp : result.getUsers()) {
+        userOp->erase();
+      }
+    }
+    contractOp.erase();
+
+    return success();
   }
 };

diff --git a/test/Integration/hoist-vector-transfer-brgemm.mlir b/test/Integration/hoist-vector-transfer-brgemm.mlir
index cfb9e4dc0..0d4cf34ed 100644
--- a/test/Integration/hoist-vector-transfer-brgemm.mlir
+++ b/test/Integration/hoist-vector-transfer-brgemm.mlir
@@ -1,9 +1,7 @@
 // RUN: tpp-opt %s | tpp-run -e entry --entry-point-result=void -print > %t.1
 // RUN: tpp-opt %s --loop-invariant-code-motion --vectorization-pass --loop-invariant-code-motion --hoist-vector-transfer | tpp-run -e entry --entry-point-result=void -print > %t.2
-// RUN: diff %t.1 %t.2 | FileCheck %s --check-prefix=DIFF --allow-empty
+// RUN: diff %t.1 %t.2
 
-// DIFF-NOT: {{.}}
-module {
 memref.global "private" constant @__constant_24x64x64xf32 : memref<24x64x64xf32> = dense<1.000000e+00> {alignment = 64 : i64}
 func.func @entry(%arg0: memref<8x24x32x64xf32>) -> memref<8x24x32x64xf32> {
   %c1 = arith.constant 1 : index
@@ -34,4 +32,28 @@ module {
   }
   return %alloc : memref<8x24x32x64xf32>
 }
+
+// -----
+
+// RUN: tpp-opt %s | tpp-run -e nomatch --entry-point-result=void -seed 123 -print > %t.1
+// RUN: tpp-opt %s --hoist-vector-transfer | tpp-run -e nomatch --entry-point-result=void -seed 123 -print > %t.2
+// RUN: diff %t.1 %t.2
+
+#permA0 = affine_map<(d0, d1, d2) -> (d2, d0)>
+#permA1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#permA2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+func.func @nomatch(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>) -> tensor<4x4xf32> {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32>
+  %1 = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32>
+  %2 = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32>
+  %3 = vector.contract {indexing_maps = [#permA0, #permA1, #permA2],
+                        iterator_types = ["parallel", "parallel", "reduction"],
+                        kind = #vector.kind<add>} %0, %1, %2
+    : vector<4x4xf32>, vector<4x4xf32> into vector<4x4xf32>
+  %4 = vector.transfer_write %3, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
+  return %4 : tensor<4x4xf32>
+}
+
diff --git a/test/Passes/pass-hoist-vector-transfer-operation-brgemm.mlir b/test/Passes/pass-hoist-vector-transfer-operation-brgemm.mlir
index 2b457ac9f..45a68a9d7 100644
--- a/test/Passes/pass-hoist-vector-transfer-operation-brgemm.mlir
+++ b/test/Passes/pass-hoist-vector-transfer-operation-brgemm.mlir
@@ -275,3 +275,102 @@ module {
 // CHECK: }
 // CHECK: return %[[VAL_10]] : memref<8x48x32x32xf32>
 // CHECK: }
+
+
+// -----
+
+// RUN: tpp-opt %s --hoist-vector-transfer --split-input-file | FileCheck %s
+
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+module {
+  memref.global "private" constant @__constant_24x64x64xf32 : memref<24x64x64xf32> = dense<1.000000e+00> {alignment = 64 : i64}
+  func.func @nomatch(%arg0: memref<8x24x32x64xf32>) -> memref<8x24x32x64xf32> {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<32x64xf32>
+    %c0 = arith.constant 0 : index
+    %0 = memref.get_global @__constant_24x64x64xf32 : memref<24x64x64xf32>
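+    // Negative case (see the checks below): the subview is followed directly
+    // by a transfer_write rather than an scf.for reduction/k-loop nest, so the
+    // hoist pattern's structural checks fail and the IR must stay unchanged.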
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x24x32x64xf32>
+    scf.forall (%arg1, %arg2) in (8, 24) {
+      %subview = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<32x64xf32, strided<[64, 1], offset: ?>>
+      vector.transfer_write %cst_0, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x64xf32>, memref<32x64xf32, strided<[64, 1], offset: ?>>
+      %subview_1 = memref.subview %arg0[%arg1, 0, 0, 0] [1, 24, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>>
+      %1 = vector.transfer_read %subview_1[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>>, vector<24x32x64xf32>
+      %2 = vector.transfer_read %0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<24x64x64xf32>, vector<24x64x64xf32>
+      %3 = vector.transfer_read %subview[%c0, %c0], %cst {in_bounds = [true, true]} : memref<32x64xf32, strided<[64, 1], offset: ?>>, vector<32x64xf32>
+      %4 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %2, %3 : vector<24x32x64xf32>, vector<24x64x64xf32> into vector<32x64xf32>
+      vector.transfer_write %4, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<32x64xf32>, memref<32x64xf32, strided<[64, 1], offset: ?>>
+    }
+    return %alloc : memref<8x24x32x64xf32>
+  }
+}
+
+
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+// CHECK-LABEL: memref.global "private" constant @__constant_24x64x64xf32 : memref<24x64x64xf32> = dense<1.000000e+00> {alignment = 64 : i64}
+
+// CHECK-LABEL: func.func @nomatch(
+// CHECK-SAME: %[[VAL_0:.*]]: memref<8x24x32x64xf32>) -> memref<8x24x32x64xf32> {
+// CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_2:.*]] = arith.constant dense<0.000000e+00> : vector<32x64xf32>
+// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_4:.*]] = memref.get_global @__constant_24x64x64xf32 : memref<24x64x64xf32>
+// CHECK: %[[VAL_5:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x24x32x64xf32>
+// CHECK: scf.forall (%[[VAL_6:.*]], %[[VAL_7:.*]]) in (8, 24) {
+// CHECK: %[[VAL_8:.*]] = memref.subview %[[VAL_5]]{{\[}}%[[VAL_6]], %[[VAL_7]], 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<32x64xf32, strided<[64, 1], offset: ?>>
+// CHECK: vector.transfer_write %[[VAL_2]], %[[VAL_8]]{{\[}}%[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true]} : vector<32x64xf32>, memref<32x64xf32, strided<[64, 1], offset: ?>>
+// CHECK: %[[VAL_9:.*]] = memref.subview %[[VAL_0]]{{\[}}%[[VAL_6]], 0, 0, 0] [1, 24, 32, 64] [1, 1, 1, 1] : memref<8x24x32x64xf32> to memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>>
+// CHECK: %[[VAL_10:.*]] = vector.transfer_read %[[VAL_9]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<24x32x64xf32, strided<[2048, 64, 1], offset: ?>>, vector<24x32x64xf32>
+// CHECK: %[[VAL_11:.*]] = vector.transfer_read %[[VAL_4]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_1]] {in_bounds = [true, true, true]} : memref<24x64x64xf32>, vector<24x64x64xf32>
+// CHECK: %[[VAL_12:.*]] = vector.transfer_read %[[VAL_8]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_1]] {in_bounds = [true, true]} : memref<32x64xf32, strided<[64, 1], offset: ?>>, vector<32x64xf32>
+// CHECK: %[[VAL_13:.*]] = vector.contract {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %[[VAL_10]], %[[VAL_11]], %[[VAL_12]] : vector<24x32x64xf32>, vector<24x64x64xf32> into vector<32x64xf32>
+// CHECK: vector.transfer_write %[[VAL_13]], %[[VAL_8]]{{\[}}%[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true]} : vector<32x64xf32>, memref<32x64xf32, strided<[64, 1], offset: ?>>
+// CHECK: }
+// CHECK: return %[[VAL_5]] : memref<8x24x32x64xf32>
+// CHECK: }
+
+// -----
+
+// RUN: tpp-opt %s --hoist-vector-transfer --split-input-file | FileCheck %s
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+module {
+  func.func @nomatch1(%arg0: tensor<4x1xf32>, %arg1: tensor<1x64xf32>, %arg2: tensor<4x64xf32>) -> tensor<4x64xf32> {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant 0.000000e+00 : f32
+    %0 = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x1xf32>, vector<4x1xf32>
+    %1 = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x64xf32>, vector<1x64xf32>
+    %2 = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x64xf32>, vector<4x64xf32>
+    %3 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %0, %1, %2 : vector<4x1xf32>, vector<1x64xf32> into vector<4x64xf32>
+    %4 = vector.transfer_write %3, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<4x64xf32>, tensor<4x64xf32>
+    return %4 : tensor<4x64xf32>
+  }
+}
+
+
+// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+
+// CHECK-LABEL: func.func @nomatch1(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x1xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x64xf32>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<4x64xf32>) -> tensor<4x64xf32> {
+// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_5:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : tensor<4x1xf32>, vector<4x1xf32>
+// CHECK: %[[VAL_6:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : tensor<1x64xf32>, vector<1x64xf32>
+// CHECK: %[[VAL_7:.*]] = vector.transfer_read %[[VAL_2]]{{\[}}%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : tensor<4x64xf32>, vector<4x64xf32>
+// CHECK: %[[VAL_8:.*]] = vector.contract {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %[[VAL_5]], %[[VAL_6]], %[[VAL_7]] : vector<4x1xf32>, vector<1x64xf32> into vector<4x64xf32>
+// CHECK: %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_2]]{{\[}}%[[VAL_3]], %[[VAL_3]]] {in_bounds = [true, true]} : vector<4x64xf32>, tensor<4x64xf32>
+// CHECK: return %[[VAL_9]] : tensor<4x64xf32>
+// CHECK: }

From dc1b0f241f3e35b0bdaf745992ae4c0786250e34 Mon Sep 17 00:00:00 2001
From: Arun Thangamani
Date: Wed, 13 Nov 2024 22:27:52 -0800
Subject: [PATCH 14/14] Code refactoring and addition of a few negative test checks

---
 lib/TPP/Transforms/HoistVectorTransfers.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/TPP/Transforms/HoistVectorTransfers.cpp b/lib/TPP/Transforms/HoistVectorTransfers.cpp
index 941d49b63..48e07dec8 100644
--- a/lib/TPP/Transforms/HoistVectorTransfers.cpp
+++ b/lib/TPP/Transforms/HoistVectorTransfers.cpp
@@ -57,7 +57,7 @@ struct HoistVectorTransferOp : OpRewritePattern<vector::ContractionOp> {
     if (KForOp == NULL)
       return rewriter.notifyMatchFailure(contractOp, "Not a linalg tile + vector contract operation");
 
-    // Move the vector transfer read before the resuction and k loop
+    // Move the vector transfer read before the reduction and k loop
     rewriter.setInsertionPointAfter(subviewOp);
     auto *cloneVectorReadOp = rewriter.clone(*retriveVectorReadOp);
    retriveVectorReadOp.replaceAllUsesWith(cloneVectorReadOp);
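
The net effect of the series on a brgemm tile, schematically (shapes and names
are illustrative only, mirroring the 4x2 accumulator tiles in the tests above;
"..." elides the unchanged operand reads). Before the rewrite, the accumulator
tile of the result matrix is re-read and re-written on every reduction/k
iteration; after it, the tile is read once, carried through iter_args, and
written back once:

  // Before (sketch): read/accumulate/write inside the loop nest.
  scf.for %r = %c0 to %R step %c1 {
    scf.for %k = %c0 to %K step %c4 {
      %acc = vector.transfer_read %Csub[%c0, %c0], %pad : memref<4x2xf32>, vector<4x2xf32>
      %res = vector.contract ... %a, %b, %acc : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
      vector.transfer_write %res, %Csub[%c0, %c0] : vector<4x2xf32>, memref<4x2xf32>
    }
  }

  // After (sketch): transfers hoisted, accumulator threaded through iter_args.
  %init = vector.transfer_read %Csub[%c0, %c0], %pad : memref<4x2xf32>, vector<4x2xf32>
  %sum = scf.for %r = %c0 to %R step %c1 iter_args(%accR = %init) -> (vector<4x2xf32>) {
    %inner = scf.for %k = %c0 to %K step %c4 iter_args(%accK = %accR) -> (vector<4x2xf32>) {
      %res = vector.contract ... %a, %b, %accK : vector<1x4x4xf32>, vector<1x4x2xf32> into vector<4x2xf32>
      scf.yield %res : vector<4x2xf32>
    }
    scf.yield %inner : vector<4x2xf32>
  }
  vector.transfer_write %sum, %Csub[%c0, %c0] : vector<4x2xf32>, memref<4x2xf32>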