Fix transform dialect test and cleanup some usages. (#10782)
This revision cherry-picks LLVM commit
d8cab3f407070c6d80396553ce024e17a0659b04 and manually resolves
conflicts. This is necessary to fix a bug in the transform dialect.

This allows reactivating the softmax GPU example.
nicolasvasilache authored Oct 25, 2022
1 parent 1958c15 commit 95ed505
Showing 10 changed files with 134 additions and 146 deletions.
@@ -101,10 +101,10 @@ DiagnosedSilenceableFailure transform_dialect::ApplyPatternsOp::applyToOne(
Operation *target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
if (!target->hasTrait<OpTrait::IsIsolatedFromAbove>()) {
target->emitOpError(
return mlir::emitDefiniteFailure(
target,
"applies only to isolated-from-above targets because it needs to apply "
"patterns greedily");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
}
MLIRContext *ctx = target->getContext();
RewritePatternSet patterns(ctx);
@@ -121,8 +121,13 @@ DiagnosedSilenceableFailure transform_dialect::ApplyPatternsOp::applyToOne(
LogicalResult result = applyPatternsAndFoldGreedily(
target, std::move(patterns), config, &listener);
LogicalResult listenerResult = listener.checkErrorState();
if (failed(result) || failed(listenerResult))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
if (failed(result)) {
return mlir::emitDefiniteFailure(target,
"greedy pattern application failed");
}
if (failed(listenerResult))
return mlir::emitDefiniteFailure(target, "listener tracking failed");

results.assign({target});
return DiagnosedSilenceableFailure(success());
}
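For reference, the error-reporting idiom adopted throughout this commit replaces emitOpError followed by reportUnknownTransformError with the transform dialect's structured diagnostics. Below is a minimal sketch of an applyToOne override using only calls that appear in this diff; MyExampleOp and its two checks are hypothetical and not part of the commit.

// Hedged sketch, not code from this commit: MyExampleOp is a hypothetical
// transform op illustrating the structured-diagnostic idiom.
DiagnosedSilenceableFailure MyExampleOp::applyToOne(
    Operation *target, SmallVectorImpl<Operation *> &results,
    transform::TransformState &state) {
  // Unrecoverable precondition: a definite failure aborts interpretation
  // regardless of the sequence's failure-propagation mode.
  if (!target->hasTrait<OpTrait::IsIsolatedFromAbove>())
    return mlir::emitDefiniteFailure(
        target, "expected an isolated-from-above payload op");
  // Recoverable condition: a silenceable failure can be suppressed by a
  // failures(suppress) sequence; keep the result handle well-defined.
  if (target->getNumRegions() == 0) {
    results.assign(1, nullptr);
    return mlir::emitSilenceableFailure(target, "payload op has no regions");
  }
  results.assign({target});
  return DiagnosedSilenceableFailure::success();
}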
@@ -207,10 +212,10 @@ DiagnosedSilenceableFailure transform_dialect::IREEBufferizeOp::apply(
if (payload.size() != 1 ||
!isa<ModuleOp, HAL::ExecutableOp, HAL::ExecutableVariantOp>(
payload.front())) {
state.getTopLevel()->emitOpError(
return mlir::emitDefiniteFailure(
state.getTopLevel(),
"requires exactly a single HAL::ExecutableOp or "
"HAL::ExecutableVariantOp target op.");
return DiagnosedSilenceableFailure(failure());
}
PassManager pm(getContext());
// Bufferize the dispatch.
@@ -237,9 +242,14 @@ DiagnosedSilenceableFailure transform_dialect::IREEBufferizeOp::apply(
}
return WalkResult::advance();
});

if (res.wasInterrupted())
return DiagnosedSilenceableFailure::definiteFailure();

results.set(getOperation()->getOpResult(0), payload.front());
return DiagnosedSilenceableFailure(failure(res.wasInterrupted()));
return DiagnosedSilenceableFailure::success();
}

/// Populate the workgroup_count region of `dispatchOp`.
/// For now, this only supports constant index ops and empty workload operands.
/// Assumes the HAL::ExecutableExportOp is built with an empty region.
@@ -437,20 +447,21 @@ transform_dialect::ForeachThreadToWorkgroupOp::applyToOne(
func::FuncOp target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
if (!isa<HAL::ExecutableOp, HAL::ExecutableVariantOp>(state.getTopLevel())) {
state.getTopLevel()->emitOpError(
return mlir::emitDefiniteFailure(
state.getTopLevel(),
"requires HAL::ExecutableOp or HAL::ExecutableVariantOp toplevel "
"to attach the workgroup size information to a nested "
"ExecutableExportOp");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
}

IREE::HAL::ExecutableExportOp exportOp;
state.getTopLevel()->walk([&](IREE::HAL::ExecutableExportOp op) {
if (op.getSymName() == target.getName()) exportOp = op;
});
if (!exportOp) {
state.getTopLevel()->emitOpError("no IREE::HAL::ExecutableExportOp found");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
results.assign(1, nullptr);
return mlir::emitSilenceableFailure(
target, "no IREE::HAL::ExecutableExportOp found");
}

scf::ForeachThreadOp topLevelForeachThreadOp;
@@ -463,18 +474,19 @@ transform_dialect::ForeachThreadToWorkgroupOp::applyToOne(
});

if (walkResult.wasInterrupted()) {
state.getTopLevel()->emitOpError(
"could not find a unique topLevel scf.foreach_thread");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
results.assign(1, nullptr);
return mlir::emitSilenceableFailure(
target, "could not find a unique topLevel scf.foreach_thread");
}

SimplePatternRewriter rewriter(topLevelForeachThreadOp);
if (failed(rewriteForeachThreadToWorkgroup(topLevelForeachThreadOp, exportOp,
rewriter)))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
rewriter))) {
return mlir::emitDefiniteFailure(target,
"rewriteForeachThreadToWorkgroup failed");
}

results.assign({target});

return DiagnosedSilenceableFailure(success());
}
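Note how the two lookup failures above (missing export op, no unique top-level scf.foreach_thread) are reported as silenceable failures with a nullptr result handle, while the actual rewrite failure stays a definite failure. In a sequence running with failures(suppress), as the updated reduction spec below now does, the former let interpretation continue; the latter always aborts.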

@@ -560,10 +572,7 @@ void transform_dialect::TileToForeachThreadAndWorkgroupCountRegion::getEffects(
transform::onlyReadsHandle(getTileSizes(), effects);
transform::onlyReadsHandle(getNumThreads(), effects);
transform::producesHandle(getResults(), effects);
effects.emplace_back(MemoryEffects::Read::get(),
transform::PayloadIRResource::get());
effects.emplace_back(MemoryEffects::Write::get(),
transform::PayloadIRResource::get());
transform::modifiesPayload(effects);
}

DiagnosedSilenceableFailure
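transform::modifiesPayload(effects) is a convenience helper: per the two lines it replaces above, it declares both a read and a write effect on the transform payload IR resource. A hedged sketch of the equivalence, for readers unfamiliar with the helper:

// Hedged sketch: roughly what transform::modifiesPayload(effects) expands to,
// judging from the two emplace_back calls removed in this hunk.
static void declarePayloadReadWrite(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  effects.emplace_back(MemoryEffects::Read::get(),
                       transform::PayloadIRResource::get());
  effects.emplace_back(MemoryEffects::Write::get(),
                       transform::PayloadIRResource::get());
}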
@@ -575,25 +584,23 @@ transform_dialect::TileToForeachThreadAndWorkgroupCountRegion::apply(
auto funcOp = targetOps.front()->getParentOfType<func::FuncOp>();
FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
if (failed(exportOp)) {
state.getTopLevel()->emitOpError("couldn't find export op for func");
return DiagnosedSilenceableFailure(reportUnknownTransformError(funcOp));
return mlir::emitDefiniteFailure(state.getTopLevel(),
"couldn't find export op for func");
}

SmallVector<OpFoldResult> mixedTileSizes = getMixedTileSizes();
if (mixedTileSizes.empty()) {
exportOp.value()->emitOpError("require tile sizes to be specified");
return DiagnosedSilenceableFailure(
reportUnknownTransformError(exportOp.value()));
return mlir::emitDefiniteFailure(exportOp.value(),
"require tile sizes to be specified");
}

/// Lower the workgroup count region in keeping with the way dispatch
/// regions are created by default in IREE's compilation flow.
IRRewriter rewriter(getContext());
if (failed(lowerWorkgroupCountComputingRegion(rewriter, exportOp.value(),
mixedTileSizes))) {
exportOp.value()->emitOpError("failed to lower workgroup count region");
return DiagnosedSilenceableFailure(
reportUnknownTransformError(exportOp.value()));
return mlir::emitDefiniteFailure(exportOp.value(),
"failed to lower workgroup count region");
}

ArrayRef<Operation *> targets = state.getPayloadOps(getTarget());
@@ -607,11 +614,16 @@ transform_dialect::TileToForeachThreadAndWorkgroupCountRegion::apply(
targets, getMixedNumThreads(), getMixedTileSizes(), getThreadDimMapping(),
tileOps, tiledOps);

if (!diag.succeeded()) return diag;
if (!diag.succeeded()) {
transformResults.set(getForeachThreadOp().cast<OpResult>(),
SmallVector<mlir::Operation *>{});
transformResults.set(getTiledOp().cast<OpResult>(),
SmallVector<mlir::Operation *>{});
return diag;
}

transformResults.set(getForeachThreadOp().cast<OpResult>(), tileOps);
transformResults.set(getTiledOp().cast<OpResult>(), tiledOps);

return DiagnosedSilenceableFailure(success());
}
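The two transformResults.set(..., {}) calls added on the failure path appear intended to keep the op's declared result handles bound, with empty payload lists, even when tiling fails silenceably, so that a suppressing sequence can continue without reading unset handles.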

@@ -238,12 +238,11 @@ transform_dialect::VectorToWarpExecuteOnLane0Op::applyToOne(
scf::IfOp target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
if (!isa<HAL::ExecutableOp, HAL::ExecutableVariantOp>(state.getTopLevel())) {
state.getTopLevel()->emitOpError(
"requires HAL::ExecutableOp or HAL::ExecutableVariantOp toplevel so "
"that IR is properly isolated. This is required so we can safely "
"inspect the HAL::ExecutableExportOp under multi-threaded pass "
"assumptions.");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
return emitDefaultSilenceableFailure(state.getTopLevel())
<< "requires HAL::ExecutableOp or HAL::ExecutableVariantOp toplevel "
"so that IR is properly isolated. This is required so we can "
"safely inspect the HAL::ExecutableExportOp under multi-threaded "
"pass assumptions.";
}

auto halExecutableVariantOp =
@@ -569,7 +569,7 @@ forgetUnnecessaryHandles(transform::TransformState &state,
continue;

for (Operation *payload : state.getPayloadOps(operand)) {
if (seen.contains(payload))
if (!payload || seen.contains(payload))
continue;
SmallVector<Value> allHandles;
(void)state.getHandlesForPayloadOp(payload, allHandles);
@@ -592,7 +592,7 @@ forgetUnnecessaryHandles(transform::TransformState &state,
if (!result.getUses().empty())
continue;
for (Operation *payload : state.getPayloadOps(result)) {
if (seen.contains(payload))
if (!payload || seen.contains(payload))
continue;
listener->removeMappings(payload);
seen.insert(payload);
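The added !payload guards in both loops presumably tolerate handles that map to null payload ops, which the silenceable-failure paths above now record via results.assign(1, nullptr).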
72 changes: 37 additions & 35 deletions tests/transform_dialect/cuda/BUILD
@@ -7,7 +7,7 @@
# Tests for end-to-end IREE support of entire models or their close derivatives.

load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
#load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")

package(
features = ["layering_check"],
@@ -23,37 +23,39 @@ endif()
inline = True,
)

# TODO: re-enable the tests
# iree_lit_test_suite(
# name = "lit",
# srcs = [
# "reduction.mlir",
# "softmax.mlir",
# ],
# cfg = "//tests:lit.cfg.py",
# # transform dialect spec files are MLIR files that specify a transformation,
# # they need to be included as data.
# data = [
# "reduction_codegen_spec.mlir",
# "softmax_codegen_spec.mlir",
# # FIXME: This cannot be retired yet as there is some writeonly vs readwrite
# # issue and we even end up emitting out of bounds accesses.
# "softmax_dispatch_spec.mlir",
# "softmax_fused_codegen_spec.mlir",
# ],
# tags = [
# # CUDA cuInit fails with sanitizer on.
# "noasan",
# "nomsan",
# "notsan",
# "noubsan",
# "requires-gpu-nvidia",
# "driver=cuda",
# ],
# tools = [
# "//tools:iree-compile",
# "//tools:iree-opt",
# "//tools:iree-run-module",
# "@llvm-project//llvm:FileCheck",
# ],
# )
iree_lit_test_suite(
name = "lit",
srcs = [
"reduction.mlir",
"softmax.mlir",
],
cfg = "//tests:lit.cfg.py",
# transform dialect spec files are MLIR files that specify a transformation;
# they need to be included as data.
data = [
"reduction_codegen_spec.mlir",
"softmax_codegen_spec.mlir",
#
# FIXME: Fused codegen must be used with the custom dispatch region formation
# because IREE pulls in tensor.empty by default.
# This results in threadprivate allocations and prevents vector distribution.
#
"softmax_dispatch_spec.mlir",
"softmax_fused_codegen_spec.mlir",
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-nvidia",
"driver=cuda",
],
tools = [
"//tools:iree-compile",
"//tools:iree-opt",
"//tools:iree-run-module",
"@llvm-project//llvm:FileCheck",
],
)
25 changes: 25 additions & 0 deletions tests/transform_dialect/cuda/CMakeLists.txt
@@ -14,4 +14,29 @@ if(NOT IREE_HAL_DRIVER_CUDA OR NOT IREE_TARGET_BACKEND_CUDA)
return()
endif()

iree_lit_test_suite(
NAME
lit
SRCS
"reduction.mlir"
"softmax.mlir"
TOOLS
FileCheck
iree-compile
iree-opt
iree-run-module
DATA
reduction_codegen_spec.mlir
softmax_codegen_spec.mlir
softmax_dispatch_spec.mlir
softmax_fused_codegen_spec.mlir
LABELS
"noasan"
"nomsan"
"notsan"
"noubsan"
"requires-gpu-nvidia"
"driver=cuda"
)

### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
4 changes: 2 additions & 2 deletions tests/transform_dialect/cuda/reduction_codegen_spec.mlir
@@ -1,6 +1,6 @@
// RUN: iree-opt %s

transform.structured.canonicalized_sequence failures(propagate) {
transform.structured.canonicalized_sequence failures(suppress) {
^bb1(%variant_op: !pdl.operation):
%fill = transform.structured.match ops{["linalg.fill"]} in %variant_op

@@ -15,7 +15,7 @@ transform.structured.canonicalized_sequence failures(propagate) {
// The mapping to block ids can only happen after bufferization atm.
%foreach_thread_grid, %grid_combiner_op =
transform.iree.tile_to_foreach_thread_and_workgroup_count_region %combiner_op tile_sizes [1]
%not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op
%not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op : !pdl.operation
transform.structured.fuse_into_containing_op %not_combiner into %foreach_thread_grid

// Second level of tiling + fusion parallelizes to threads.
16 changes: 9 additions & 7 deletions tests/transform_dialect/cuda/softmax.mlir
@@ -13,25 +13,27 @@
// RUN: iree-run-module --entry_function=max_sub_exp --device=cuda | \
// RUN: FileCheck %s

///
/// FIXME: Fused codegen must be used with the custom dispatch region formation
/// because IREE pulls in tensor.empty by default.
/// This results in threadprivate allocations and prevents vector distribution.
///
// RUN: iree-opt %s --iree-hal-target-backends=cuda \
// RUN: --iree-abi-transformation-pipeline \
// RUN: --iree-flow-transformation-pipeline \
///
/// FIXME: This cannot be retired yet as there is some writeonly vs readwrite
/// issue and we even end up emitting out of bounds accesses.
///
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
// RUN: --iree-stream-transformation-pipeline \
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='hal.executable(hal.executable.variant(iree-llvmgpu-lower-executable-target-pass))' \
// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_fused_codegen_spec.mlir | \
// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE

// RUN: iree-compile %s --iree-hal-target-backends=cuda \
///
/// FIXME: This cannot be retired yet as there is some writeonly vs readwrite
/// issue and we even end up emitting out of bounds accesses.
/// FIXME: Fused codegen must be used with the custom dispatch region formation
/// because IREE pulls in tensor.empty by default.
/// This results in threadprivate allocations and prevents vector distribution.
///
// RUN: iree-compile %s --iree-hal-target-backends=cuda \
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_fused_codegen_spec.mlir | \
// RUN: iree-run-module --entry_function=max_sub_exp --device=cuda | \
2 changes: 1 addition & 1 deletion tests/transform_dialect/cuda/softmax_codegen_spec.mlir
@@ -10,7 +10,7 @@ transform.structured.canonicalized_sequence failures(propagate) {
%fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
%red = transform.structured.match interface{LinalgOp}
attributes{iterator_types = ["parallel", "parallel", "reduction"]} in %variant_op
%not_root = merge_handles %fill, %red
%not_root = merge_handles %fill, %red : !pdl.operation
%foreach_thread, %tiled_generic =
transform.iree.tile_to_foreach_thread_and_workgroup_count_region %root tile_sizes [1, 4]
transform.structured.fuse_into_containing_op %not_root into %foreach_thread