Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize tiling sizes heuristics for elementwise dispatches. #10179

Merged
merged 7 commits into from
Aug 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1106,11 +1106,102 @@ static LogicalResult setTransposeLikeOpRootConfig(func::FuncOp entryPointFn,
tileSizes, passPipeline);
}

/// Sets elementwise dispatches to use the peeling approach. It scales the
/// workload per workgroup up to a larger number, which prevents runtime
/// overheads from dominating tiny dispatches.
///
/// Returns success() without attaching a config when the op already has a
/// lowering config, has zero loops, or is not elementwise — later
/// configuration steps may then handle it.
static LogicalResult setElementwiseGenericOpRootConfig(
    func::FuncOp entryPointFn, linalg::GenericOp genericOp) {
  // Respect a previously attached lowering config.
  if (getLoweringConfig(genericOp)) {
    return success();
  }

  unsigned numLoops = genericOp.getNumLoops();
  if (numLoops == 0) return success();
  // This heuristic only targets purely elementwise generic ops.
  if (!linalg::isElementwise(genericOp)) return success();

  // Set the flow level tiling to the default.
  SmallVector<int64_t> minTileSizes =
      getMinTilingSizesForEachDim(entryPointFn, genericOp);
  SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize);
  SmallVector<int64_t> flowTileSizes =
      getDefaultDistributedLevelTileSizes(genericOp, minTileSizes, maxTileSizes,
                                          /*allowIncompleteTile=*/true);

  // Adjust the number of workload per workgroup to at least 4096. This
  // prevents the runtime overheads dominating the execution time. The number
  // is derived from experiments. We should be able to make it related to
  // target.
  constexpr int64_t kMinimumWorkload = 4096;
  auto shape = genericOp.getStaticLoopRanges();
  int64_t numWorkload = 1;
  for (auto en : llvm::enumerate(shape)) {
    int64_t size = en.value();
    if (size == ShapedType::kDynamicSize) {
      // Any dynamic loop range makes the total workload unknown; mark it as
      // dynamic and stop accumulating.
      numWorkload = ShapedType::kDynamicSize;
      break;
    }
    size_t index = en.index();
    if (flowTileSizes[index]) {
      // A non-zero flow tile size overrides the static loop range for this
      // dimension's contribution to the workload.
      size = flowTileSizes[index];
    }
    numWorkload *= size;
  }
  // Double one tiled dimension at a time until the per-workgroup workload
  // reaches the minimum, or no dimension can grow further. Dimensions that are
  // untiled (size 0), already at full extent, or dynamic are skipped; a
  // dynamic total workload also ends the scaling.
  for (unsigned currDim = 0;
       numWorkload < kMinimumWorkload && currDim < numLoops;) {
    int64_t currSize = flowTileSizes[currDim];
    if (currSize == shape[currDim] || currSize == 0 ||
        shape[currDim] == ShapedType::kDynamicSize ||
        numWorkload == ShapedType::kDynamicSize) {
      currDim++;
      continue;
    }
    int64_t newSize = std::min<int64_t>(currSize * 2, shape[currDim]);
    // Replace this dimension's old contribution with the grown one.
    numWorkload = numWorkload / currSize * newSize;
    flowTileSizes[currDim] = newSize;
  }

  // Adjust tiling sizes of vector levels to avoid large unroll factors.
  SmallVector<int64_t> vecTileSizes(minTileSizes.begin(), minTileSizes.end());
  for (auto operand : genericOp.getOutputOperands()) {
    constexpr int64_t kMaxUnrollFactor = 8;
    AffineMap map = genericOp.getTiedIndexingMap(operand);
    int64_t vecSize = getVectorSize(entryPointFn, operand->get().getType());
    int64_t currSize = 1;
    // Walk the non-fastest-varying result dims (innermost first) and reset
    // any tile size that would push unrolling past the budget.
    for (auto dimExpr : llvm::reverse(map.getResults().drop_back())) {
      unsigned pos = dimExpr.cast<AffineDimExpr>().getPosition();
      if (vecTileSizes[pos] * currSize > vecSize * kMaxUnrollFactor) {
        vecTileSizes[pos] = 1;
        currSize = vecSize * kMaxUnrollFactor;
      }
    }
    // Cap the fastest-varying dimension at the unroll budget directly.
    int fastestPos =
        map.getResults().back().cast<AffineDimExpr>().getPosition();
    vecTileSizes[fastestPos] =
        std::min<int64_t>(vecTileSizes[fastestPos], kMaxUnrollFactor);
  }

  // Setting reduction tile sizes is a workaround to kick in peeling transform.
  // The tiling won't happen because the sizes are zeros.
  SmallVector<int64_t> zeros(numLoops, 0);

  TileSizesListType tileSizes;
  tileSizes.push_back(flowTileSizes);
  tileSizes.push_back(vecTileSizes);
  tileSizes.push_back(zeros);

  // Tensor-semantics ops go through the peeling expert; buffer-semantics ops
  // use the buffer tile-and-vectorize pipeline.
  auto passPipeline =
      genericOp.hasTensorSemantics()
          ? DispatchLoweringPassPipeline::CPUDoubleTilingPeelingExpert
          : DispatchLoweringPassPipeline::CPUBufferOpsTileAndVectorize;
  return setOpConfigAndEntryPointFnTranslation(entryPointFn, genericOp,
                                               tileSizes, passPipeline);
}

/// Sets the lowering configuration for a generic op to use
/// CPUDoubleTilingExpert pipeline.
static LogicalResult setRootConfig(func::FuncOp entryPointFn,
linalg::GenericOp genericOp) {
if (failed(setTransposeLikeOpRootConfig(entryPointFn, genericOp)) ||
failed(setElementwiseGenericOpRootConfig(entryPointFn, genericOp)) ||
failed(setDefaultGenericOpRootConfig(entryPointFn, genericOp))) {
return failure();
}
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager) {
LinalgSingleTilingExpertPassOptions options;
options.tilingLevel =
static_cast<int64_t>(StrategyTilingLevel::ParallelTiles);
options.peel = true;
options.vectorize = true;
nestedModulePM.addNestedPass<func::FuncOp>(
createLinalgSingleTilingExpertPass(options));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ hal.executable private @add {
}
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32], [1, 4], [0, 0]]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 4], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @add
// CHECK-SAME: translation_info = #[[TRANSLATION]]
Expand Down Expand Up @@ -275,7 +275,7 @@ hal.executable private @add4D {
}
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 32, 32, 32], [1, 1, 1, 4], [0, 0, 0, 0]]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 64, 64, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @add4D
// CHECK-SAME: translation_info = #[[TRANSLATION]]
Expand Down Expand Up @@ -316,8 +316,8 @@ hal.executable private @add_static {
}
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 8, 16, 32], [1, 1, 1, 4], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 8, 16, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @add_static
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
Expand Down Expand Up @@ -408,7 +408,7 @@ hal.executable @copy_op_dynamic {
}
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32], [1, 1], [0, 0]{{\]}}>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 4], [0, 0]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
// CHECK: hal.executable.export public @copy_op_dynamic
// CHECK-SAME: translation_info = #[[TRANSLATION]]
Expand Down Expand Up @@ -738,8 +738,8 @@ hal.executable private @generic_static {
}
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 32], [16, 16], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 96], [16, 8], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @generic_static
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
Expand Down Expand Up @@ -1088,8 +1088,8 @@ hal.executable private @generic_unit_dims_dynamic {
}
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 0, 32, 32, 0, 32], [0, 1, 0, 0, 1, 1, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 0, 64, 64, 0, 64], [1, 1, 1, 1, 1, 1, 1, 4], [0, 0, 0, 0, 0, 0, 0, 0]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @generic_unit_dims_dynamic
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
Expand Down