test2

hanhanW committed Aug 25, 2022
1 parent 0f58a76 commit f4e8518
Showing 2 changed files with 10 additions and 29 deletions.
37 changes: 9 additions & 28 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1123,6 +1123,7 @@ static LogicalResult setElementwiseGenericOpRootConfig(
/*allowIncompleteTile=*/true);

// Adjust the number of workload per workgroup to at least 4096.
constexpr int64_t kMinimumWorkload = 4096;
auto shape = genericOp.getStaticLoopRanges();
int64_t numWorkload = 1;
for (auto en : llvm::enumerate(shape)) {
@@ -1137,8 +1138,6 @@ static LogicalResult setElementwiseGenericOpRootConfig(
}
numWorkload *= size;
}

constexpr int64_t kMinimumWorkload = 4096;
for (unsigned currDim = 0;
numWorkload < kMinimumWorkload && currDim < numLoops;) {
int64_t currSize = flowTileSizes[currDim];
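Between these two hunks, the change so far only moves kMinimumWorkload up so the constant sits next to the comment that motivates it. For readers without the full file, below is a minimal self-contained sketch of the growth loop the constant drives; the loop body hidden by the collapsed hunk is reconstructed under assumptions, and growToMinWorkload with plain std::vector parameters is an illustrative stand-in, not the IREE signature.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    constexpr int64_t kMinimumWorkload = 4096;

    // Doubles one tile size at a time (capped at its loop range) until the
    // per-workgroup workload reaches kMinimumWorkload or all dims are exhausted.
    void growToMinWorkload(std::vector<int64_t> &flowTileSizes,
                           const std::vector<int64_t> &loopRanges,
                           int64_t numWorkload) {
      for (unsigned currDim = 0;
           numWorkload < kMinimumWorkload && currDim < flowTileSizes.size();) {
        int64_t currSize = flowTileSizes[currDim];
        // Assumed hidden logic: skip dims that are untiled or already maximal.
        if (currSize == 0 || currSize >= loopRanges[currDim]) {
          ++currDim;
          continue;
        }
        int64_t newSize = std::min<int64_t>(currSize * 2, loopRanges[currDim]);
        numWorkload = numWorkload / currSize * newSize;
        flowTileSizes[currDim] = newSize;
      }
    }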
@@ -1153,34 +1152,16 @@ static LogicalResult setElementwiseGenericOpRootConfig(
flowTileSizes[currDim] = newSize;
}

// Set the next level tile sizes.
SmallVector<int64_t> parallelTileSizes;
auto inputOutputOpOperands = genericOp.getInputAndOutputOperands();
for (auto map : llvm::enumerate(genericOp.getIndexingMapsArray())) {
// Check the fastest varying dimension of the operand. Set the vector size
// of the corresponding loop to the vector size.
if (map.value().getNumResults() == 0) continue;
auto fastestVaryingDimExpr =
map.value().getResults().back().dyn_cast<AffineDimExpr>();
if (!fastestVaryingDimExpr) continue;
unsigned fastestVaryingDim = fastestVaryingDimExpr.getPosition();

// If the indexing map has result it has to be a shaped type.
auto operandType =
inputOutputOpOperands[map.index()]->get().getType().cast<ShapedType>();
int64_t tileSize = getVectorSize(entryPointFn, operandType);
// Vectorization of reductions is driven by input tensors and considering
// the output's fastest varying dim leads to large unroll factors. We limit
// the tile size for this case to 'maxUnrollFactor'.
minTileSizes[fastestVaryingDim] =
std::min<int64_t>(minTileSizes[fastestVaryingDim], 8);
minTileSizes[fastestVaryingDim] =
std::min<int64_t>(minTileSizes[fastestVaryingDim], tileSize);
// Limit the tiling sizes to avoid large unroll factors. Most of the use cases
// are i32 and f32, so we divide the vector size by four by default. This can
// be relaxed once we have better control on vector unrolling.
SmallVector<int64_t> parallelTileSizes(minTileSizes.begin(),
minTileSizes.end());
int64_t vectorSize = getNativeVectorSizeInBytes(entryPointFn).value() / 4;
for (auto &size : parallelTileSizes) {
size = std::min<int64_t>(size, vectorSize);
}

setX86WorkgroupTileSizes(genericOp, flowTileSizes, minTileSizes,
parallelTileSizes, /*allowIncompleteTile=*/true);

// Setting reduction tile sizes is a workaround to kick in peeling transform.
// The tiling won't happen because the sizes are zeros.
SmallVector<int64_t> reductionTileSizes(numLoops, 0);
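The replacement code above drops the per-operand scan of fastest-varying dimensions in favor of one uniform clamp against the native vector width. Below is a hedged sketch of that clamp in isolation; clampParallelTileSizes and the plain std::vector types are illustrative stand-ins, and the division by four assumes 4-byte (i32/f32) elements, as the new comment states.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Clamp each parallel tile size to the lane count of one native vector,
    // where lanes = vector bytes / 4 under the 4-byte-element assumption.
    std::vector<int64_t> clampParallelTileSizes(
        const std::vector<int64_t> &minTileSizes,
        int64_t nativeVectorSizeInBytes) {
      int64_t vectorSize = nativeVectorSizeInBytes / 4;
      std::vector<int64_t> parallelTileSizes(minTileSizes.begin(),
                                             minTileSizes.end());
      for (auto &size : parallelTileSizes)
        size = std::min<int64_t>(size, vectorSize);
      return parallelTileSizes;
    }

For example, with 32-byte (256-bit AVX2) vectors the clamp is 8 lanes, so a minimum tile size of 16 becomes 8, keeping vector unrolling modest until finer control is available.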
@@ -342,7 +342,7 @@ LinalgMatmulOpToLinalgMmt4DOpPattern::chooseTileParams(Value lhs, Value rhs,
int m0k0n0ForVecMat[3] = {m0k0n0ForMatVec[2], m0k0n0ForMatVec[1],
m0k0n0ForMatVec[0]};
return Mmt4DTileParams(m0k0n0ForVecMat, comment + ", vector*matrix");
}else {
} else {
return Mmt4DTileParams(m0k0n0, comment);
}
};
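The context around this formatting fix shows how vector*matrix tile shapes are derived: the matrix*vector parameters are reused with the M0 and N0 entries swapped, since vecmat is the transpose of matvec. A small sketch of just that swap, assuming the {M0, K0, N0} array layout used in the patch; vecmatTileFromMatvec is an illustrative name, not an IREE helper.

    #include <array>

    // {M0, K0, N0} for vector*matrix = {N0, K0, M0} chosen for matrix*vector;
    // K0, the shared reduction dim, stays in the middle.
    std::array<int, 3> vecmatTileFromMatvec(const std::array<int, 3> &matvec) {
      return {matvec[2], matvec[1], matvec[0]};
    }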