Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize tiling sizes heuristics for elementwise dispatches. #10179

Merged
merged 7 commits into from
Aug 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1106,11 +1106,102 @@ static LogicalResult setTransposeLikeOpRootConfig(func::FuncOp entryPointFn,
tileSizes, passPipeline);
}

/// Sets elementwise dispatches to use the peeling approach. It scales the
/// workload per workgroup up to a larger number, which prevents runtime
/// overheads from dominating tiny dispatches.
///
/// Returns success() without attaching a config when the op already has a
/// lowering config, has zero loops, or is not elementwise — later
/// configuration steps may then handle it.
static LogicalResult setElementwiseGenericOpRootConfig(
    func::FuncOp entryPointFn, linalg::GenericOp genericOp) {
  // Respect a previously attached lowering config.
  if (getLoweringConfig(genericOp)) {
    return success();
  }

  unsigned numLoops = genericOp.getNumLoops();
  if (numLoops == 0) return success();
  // This heuristic only targets purely elementwise generic ops.
  if (!linalg::isElementwise(genericOp)) return success();

  // Set the flow level tiling to the default.
  SmallVector<int64_t> minTileSizes =
      getMinTilingSizesForEachDim(entryPointFn, genericOp);
  SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize);
  SmallVector<int64_t> flowTileSizes =
      getDefaultDistributedLevelTileSizes(genericOp, minTileSizes, maxTileSizes,
                                          /*allowIncompleteTile=*/true);

  // Adjust the number of workload per workgroup to at least 4096. This
  // prevents the runtime overheads dominating the execution time. The number
  // is derived from experiments. We should be able to make it related to
  // target.
  constexpr int64_t kMinimumWorkload = 4096;
  auto shape = genericOp.getStaticLoopRanges();
  int64_t numWorkload = 1;
  for (auto en : llvm::enumerate(shape)) {
    int64_t size = en.value();
    if (size == ShapedType::kDynamicSize) {
      // Any dynamic loop range makes the total workload unknown; mark it as
      // dynamic and stop accumulating.
      numWorkload = ShapedType::kDynamicSize;
      break;
    }
    size_t index = en.index();
    if (flowTileSizes[index]) {
      // A non-zero flow tile size overrides the static loop range for this
      // dimension's contribution to the workload.
      size = flowTileSizes[index];
    }
    numWorkload *= size;
  }
  // Double one tiled dimension at a time until the per-workgroup workload
  // reaches the minimum, or no dimension can grow further. Dimensions that are
  // untiled (size 0), already at full extent, or dynamic are skipped; a
  // dynamic total workload also ends the scaling.
  for (unsigned currDim = 0;
       numWorkload < kMinimumWorkload && currDim < numLoops;) {
    int64_t currSize = flowTileSizes[currDim];
    if (currSize == shape[currDim] || currSize == 0 ||
        shape[currDim] == ShapedType::kDynamicSize ||
        numWorkload == ShapedType::kDynamicSize) {
      currDim++;
      continue;
    }
    int64_t newSize = std::min<int64_t>(currSize * 2, shape[currDim]);
    // Replace this dimension's old contribution with the grown one.
    numWorkload = numWorkload / currSize * newSize;
    flowTileSizes[currDim] = newSize;
  }

  // Adjust tiling sizes of vector levels to avoid large unroll factors.
  SmallVector<int64_t> vecTileSizes(minTileSizes.begin(), minTileSizes.end());
  for (auto operand : genericOp.getOutputOperands()) {
    constexpr int64_t kMaxUnrollFactor = 8;
    AffineMap map = genericOp.getTiedIndexingMap(operand);
    int64_t vecSize = getVectorSize(entryPointFn, operand->get().getType());
    int64_t currSize = 1;
    // Walk the non-fastest-varying result dims (innermost first) and reset
    // any tile size that would push unrolling past the budget.
    for (auto dimExpr : llvm::reverse(map.getResults().drop_back())) {
      unsigned pos = dimExpr.cast<AffineDimExpr>().getPosition();
      if (vecTileSizes[pos] * currSize > vecSize * kMaxUnrollFactor) {
        vecTileSizes[pos] = 1;
        currSize = vecSize * kMaxUnrollFactor;
      }
    }
    // Cap the fastest-varying dimension at the unroll budget directly.
    int fastestPos =
        map.getResults().back().cast<AffineDimExpr>().getPosition();
    vecTileSizes[fastestPos] =
        std::min<int64_t>(vecTileSizes[fastestPos], kMaxUnrollFactor);
  }

  // Setting reduction tile sizes is a workaround to kick in peeling transform.
  // The tiling won't happen because the sizes are zeros.
  SmallVector<int64_t> zeros(numLoops, 0);

  TileSizesListType tileSizes;
  tileSizes.push_back(flowTileSizes);
  tileSizes.push_back(vecTileSizes);
  tileSizes.push_back(zeros);

  // Tensor-semantics ops go through the peeling expert; buffer-semantics ops
  // use the buffer tile-and-vectorize pipeline.
  auto passPipeline =
      genericOp.hasTensorSemantics()
          ? DispatchLoweringPassPipeline::CPUDoubleTilingPeelingExpert
          : DispatchLoweringPassPipeline::CPUBufferOpsTileAndVectorize;
  return setOpConfigAndEntryPointFnTranslation(entryPointFn, genericOp,
                                               tileSizes, passPipeline);
}

/// Sets the lowering configuration for a generic op to use
/// CPUDoubleTilingExpert pipeline.
static LogicalResult setRootConfig(func::FuncOp entryPointFn,
linalg::GenericOp genericOp) {
if (failed(setTransposeLikeOpRootConfig(entryPointFn, genericOp)) ||
failed(setElementwiseGenericOpRootConfig(entryPointFn, genericOp)) ||
failed(setDefaultGenericOpRootConfig(entryPointFn, genericOp))) {
return failure();
}
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager) {
LinalgSingleTilingExpertPassOptions options;
options.tilingLevel =
static_cast<int64_t>(StrategyTilingLevel::ParallelTiles);
options.peel = true;
options.vectorize = true;
nestedModulePM.addNestedPass<func::FuncOp>(
createLinalgSingleTilingExpertPass(options));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ hal.executable private @add {
}
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32], [1, 4], [0, 0]]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 4], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @add
// CHECK-SAME: translation_info = #[[TRANSLATION]]
Expand Down Expand Up @@ -275,7 +275,7 @@ hal.executable private @add4D {
}
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 32, 32, 32], [1, 1, 1, 4], [0, 0, 0, 0]]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 64, 64, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @add4D
// CHECK-SAME: translation_info = #[[TRANSLATION]]
Expand Down Expand Up @@ -316,8 +316,8 @@ hal.executable private @add_static {
}
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 8, 16, 32], [1, 1, 1, 4], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 8, 16, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @add_static
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
Expand Down Expand Up @@ -408,7 +408,7 @@ hal.executable @copy_op_dynamic {
}
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32], [1, 1], [0, 0]{{\]}}>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 4], [0, 0]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
// CHECK: hal.executable.export public @copy_op_dynamic
// CHECK-SAME: translation_info = #[[TRANSLATION]]
Expand Down Expand Up @@ -738,8 +738,8 @@ hal.executable private @generic_static {
}
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 32], [16, 16], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 96], [16, 8], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @generic_static
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
Expand Down Expand Up @@ -1088,8 +1088,8 @@ hal.executable private @generic_unit_dims_dynamic {
}
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 0, 32, 32, 0, 32], [0, 1, 0, 0, 1, 1, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 0, 64, 64, 0, 64], [1, 1, 1, 1, 1, 1, 1, 4], [0, 0, 0, 0, 0, 0, 0, 0]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
// CHECK: hal.executable.export public @generic_unit_dims_dynamic
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
Expand Down