From f4e85186eadadc180f1ce69521bc6931eef4c923 Mon Sep 17 00:00:00 2001 From: Hanhan Wang Date: Wed, 24 Aug 2022 18:16:17 -0700 Subject: [PATCH] test2 --- .../Codegen/LLVMCPU/KernelDispatch.cpp | 37 +++++-------------- .../Transforms/ConvertLinalgMatmulToMmt4D.cpp | 2 +- 2 files changed, 10 insertions(+), 29 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp index 68fe7709168ca..63d81dda6600e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp @@ -1123,6 +1123,7 @@ static LogicalResult setElementwiseGenericOpRootConfig( /*allowIncompleteTile=*/true); // Adjust the number of workload per workgroup to at least 4096. + constexpr int64_t kMinimumWorkload = 4096; auto shape = genericOp.getStaticLoopRanges(); int64_t numWorkload = 1; for (auto en : llvm::enumerate(shape)) { @@ -1137,8 +1138,6 @@ static LogicalResult setElementwiseGenericOpRootConfig( } numWorkload *= size; } - - constexpr int64_t kMinimumWorkload = 4096; for (unsigned currDim = 0; numWorkload < kMinimumWorkload && currDim < numLoops;) { int64_t currSize = flowTileSizes[currDim]; @@ -1153,34 +1152,16 @@ static LogicalResult setElementwiseGenericOpRootConfig( flowTileSizes[currDim] = newSize; } - // Set the next level tile sizes. - SmallVector parallelTileSizes; - auto inputOutputOpOperands = genericOp.getInputAndOutputOperands(); - for (auto map : llvm::enumerate(genericOp.getIndexingMapsArray())) { - // Check the fastest varying dimension of the operand. Set the vector size - // of the corresponding loop to the vector size. - if (map.value().getNumResults() == 0) continue; - auto fastestVaryingDimExpr = - map.value().getResults().back().dyn_cast(); - if (!fastestVaryingDimExpr) continue; - unsigned fastestVaryingDim = fastestVaryingDimExpr.getPosition(); - - // If the indexing map has result it has to be a shaped type. - auto operandType = - inputOutputOpOperands[map.index()]->get().getType().cast(); - int64_t tileSize = getVectorSize(entryPointFn, operandType); - // Vectorization of reductions is driven by input tensors and considering - // the output's fastest varying dim leads to large unroll factors. We limit - // the tile size for this case to 'maxUnrollFactor'. - minTileSizes[fastestVaryingDim] = - std::min(minTileSizes[fastestVaryingDim], 8); - minTileSizes[fastestVaryingDim] = - std::min(minTileSizes[fastestVaryingDim], tileSize); + // Limit the tiling sizes to avoid large unroll factors. Most of the use cases + // are i32 and f32, so we divide the vector size by four by default. This can + // be relaxed once we have better control on vector unrolling. + SmallVector parallelTileSizes(minTileSizes.begin(), + minTileSizes.end()); + int64_t vectorSize = getNativeVectorSizeInBytes(entryPointFn).value() / 4; + for (auto &size : parallelTileSizes) { + size = std::min(size, vectorSize); } - setX86WorkgroupTileSizes(genericOp, flowTileSizes, minTileSizes, - parallelTileSizes, /*allowIncompleteTile=*/true); - // Setting reduction tile sizes is a workaround to kick in peeling transform. // The tiling won't happen because the sizes are zeros. SmallVector reductionTileSizes(numLoops, 0); diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/ConvertLinalgMatmulToMmt4D.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/ConvertLinalgMatmulToMmt4D.cpp index e639a0a062489..0768445b23bdc 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/ConvertLinalgMatmulToMmt4D.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/ConvertLinalgMatmulToMmt4D.cpp @@ -342,7 +342,7 @@ LinalgMatmulOpToLinalgMmt4DOpPattern::chooseTileParams(Value lhs, Value rhs, int m0k0n0ForVecMat[3] = {m0k0n0ForMatVec[2], m0k0n0ForMatVec[1], m0k0n0ForMatVec[0]}; return Mmt4DTileParams(m0k0n0ForVecMat, comment + ", vector*matrix"); - }else { + } else { return Mmt4DTileParams(m0k0n0, comment); } };