Fix transform dialect test and cleanup some usages. (#10782)
This revision cherry-picks LLVM commit
d8cab3f407070c6d80396553ce024e17a0659b04 and manually resolves
conflicts. This is necessary to fix a bug in the transform dialect.

This allows reactivating the softmax GPU example.
nicolasvasilache authored Oct 25, 2022
1 parent 1958c15 commit 95ed505
Showing 10 changed files with 134 additions and 146 deletions.
@@ -101,10 +101,10 @@ DiagnosedSilenceableFailure transform_dialect::ApplyPatternsOp::applyToOne(
Operation *target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
if (!target->hasTrait<OpTrait::IsIsolatedFromAbove>()) {
target->emitOpError(
return mlir::emitDefiniteFailure(
target,
"applies only to isolated-from-above targets because it needs to apply "
"patterns greedily");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
}
MLIRContext *ctx = target->getContext();
RewritePatternSet patterns(ctx);
@@ -121,8 +121,13 @@ DiagnosedSilenceableFailure transform_dialect::ApplyPatternsOp::applyToOne(
LogicalResult result = applyPatternsAndFoldGreedily(
target, std::move(patterns), config, &listener);
LogicalResult listenerResult = listener.checkErrorState();
if (failed(result) || failed(listenerResult))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
if (failed(result)) {
return mlir::emitDefiniteFailure(target,
"greedy pattern application failed");
}
if (failed(listenerResult))
return mlir::emitDefiniteFailure(target, "listener tracking failed");

results.assign({target});
return DiagnosedSilenceableFailure(success());
}
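For reference, the error-reporting idiom adopted throughout this commit replaces emitOpError followed by reportUnknownTransformError with the transform dialect's structured diagnostics. Below is a minimal sketch of an applyToOne override using only calls that appear in this diff; MyExampleOp and its two checks are hypothetical and not part of the commit.

// Hedged sketch, not code from this commit: MyExampleOp is a hypothetical
// transform op illustrating the structured-diagnostic idiom.
DiagnosedSilenceableFailure MyExampleOp::applyToOne(
    Operation *target, SmallVectorImpl<Operation *> &results,
    transform::TransformState &state) {
  // Unrecoverable precondition: a definite failure aborts interpretation
  // regardless of the sequence's failure-propagation mode.
  if (!target->hasTrait<OpTrait::IsIsolatedFromAbove>())
    return mlir::emitDefiniteFailure(
        target, "expected an isolated-from-above payload op");
  // Recoverable condition: a silenceable failure can be suppressed by a
  // failures(suppress) sequence; keep the result handle well-defined.
  if (target->getNumRegions() == 0) {
    results.assign(1, nullptr);
    return mlir::emitSilenceableFailure(target, "payload op has no regions");
  }
  results.assign({target});
  return DiagnosedSilenceableFailure::success();
}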
@@ -207,10 +212,10 @@ DiagnosedSilenceableFailure transform_dialect::IREEBufferizeOp::apply(
if (payload.size() != 1 ||
!isa<ModuleOp, HAL::ExecutableOp, HAL::ExecutableVariantOp>(
payload.front())) {
state.getTopLevel()->emitOpError(
return mlir::emitDefiniteFailure(
state.getTopLevel(),
"requires exactly a single HAL::ExecutableOp or "
"HAL::ExecutableVariantOp target op.");
return DiagnosedSilenceableFailure(failure());
}
PassManager pm(getContext());
// Bufferize the dispatch.
@@ -237,9 +242,14 @@ DiagnosedSilenceableFailure transform_dialect::IREEBufferizeOp::apply(
}
return WalkResult::advance();
});

if (res.wasInterrupted())
return DiagnosedSilenceableFailure::definiteFailure();

results.set(getOperation()->getOpResult(0), payload.front());
return DiagnosedSilenceableFailure(failure(res.wasInterrupted()));
return DiagnosedSilenceableFailure::success();
}

/// Populate the workgroup_count region of `dispatchOp`.
/// For now, this only supports constant index ops and empty workload operands.
/// Assumes the HAL::ExecutableExportOp is built with an empty region.
@@ -437,20 +447,21 @@ transform_dialect::ForeachThreadToWorkgroupOp::applyToOne(
func::FuncOp target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
if (!isa<HAL::ExecutableOp, HAL::ExecutableVariantOp>(state.getTopLevel())) {
state.getTopLevel()->emitOpError(
return mlir::emitDefiniteFailure(
state.getTopLevel(),
"requires HAL::ExecutableOp or HAL::ExecutableVariantOp toplevel "
"to attach the workgroup size information to a nested "
"ExecutableExportOp");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
}

IREE::HAL::ExecutableExportOp exportOp;
state.getTopLevel()->walk([&](IREE::HAL::ExecutableExportOp op) {
if (op.getSymName() == target.getName()) exportOp = op;
});
if (!exportOp) {
state.getTopLevel()->emitOpError("no IREE::HAL::ExecutableExportOp found");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
results.assign(1, nullptr);
return mlir::emitSilenceableFailure(
target, "no IREE::HAL::ExecutableExportOp found");
}

scf::ForeachThreadOp topLevelForeachThreadOp;
@@ -463,18 +474,19 @@ transform_dialect::ForeachThreadToWorkgroupOp::applyToOne(
});

if (walkResult.wasInterrupted()) {
state.getTopLevel()->emitOpError(
"could not find a unique topLevel scf.foreach_thread");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
results.assign(1, nullptr);
return mlir::emitSilenceableFailure(
target, "could not find a unique topLevel scf.foreach_thread");
}

SimplePatternRewriter rewriter(topLevelForeachThreadOp);
if (failed(rewriteForeachThreadToWorkgroup(topLevelForeachThreadOp, exportOp,
rewriter)))
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
rewriter))) {
return mlir::emitDefiniteFailure(target,
"rewriteForeachThreadToWorkgroup failed");
}

results.assign({target});

return DiagnosedSilenceableFailure(success());
}
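Note how the two lookup failures above (missing export op, no unique top-level scf.foreach_thread) are reported as silenceable failures with a nullptr result handle, while the actual rewrite failure stays a definite failure. In a sequence running with failures(suppress), as the updated reduction spec below now does, the former let interpretation continue; the latter always aborts.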

@@ -560,10 +572,7 @@ void transform_dialect::TileToForeachThreadAndWorkgroupCountRegion::getEffects(
transform::onlyReadsHandle(getTileSizes(), effects);
transform::onlyReadsHandle(getNumThreads(), effects);
transform::producesHandle(getResults(), effects);
effects.emplace_back(MemoryEffects::Read::get(),
transform::PayloadIRResource::get());
effects.emplace_back(MemoryEffects::Write::get(),
transform::PayloadIRResource::get());
transform::modifiesPayload(effects);
}

DiagnosedSilenceableFailure
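transform::modifiesPayload(effects) is a convenience helper: per the two lines it replaces above, it declares both a read and a write effect on the transform payload IR resource. A hedged sketch of the equivalence, for readers unfamiliar with the helper:

// Hedged sketch: roughly what transform::modifiesPayload(effects) expands to,
// judging from the two emplace_back calls removed in this hunk.
static void declarePayloadReadWrite(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  effects.emplace_back(MemoryEffects::Read::get(),
                       transform::PayloadIRResource::get());
  effects.emplace_back(MemoryEffects::Write::get(),
                       transform::PayloadIRResource::get());
}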
@@ -575,25 +584,23 @@ transform_dialect::TileToForeachThreadAndWorkgroupCountRegion::apply(
auto funcOp = targetOps.front()->getParentOfType<func::FuncOp>();
FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
if (failed(exportOp)) {
state.getTopLevel()->emitOpError("couldn't find export op for func");
return DiagnosedSilenceableFailure(reportUnknownTransformError(funcOp));
return mlir::emitDefiniteFailure(state.getTopLevel(),
"couldn't find export op for func");
}

SmallVector<OpFoldResult> mixedTileSizes = getMixedTileSizes();
if (mixedTileSizes.empty()) {
exportOp.value()->emitOpError("require tile sizes to be specified");
return DiagnosedSilenceableFailure(
reportUnknownTransformError(exportOp.value()));
return mlir::emitDefiniteFailure(exportOp.value(),
"require tile sizes to be specified");
}

/// Lower the workgroup count region in keeping with the way dispatch
/// regions are created by default in IREE's compilation flow.
IRRewriter rewriter(getContext());
if (failed(lowerWorkgroupCountComputingRegion(rewriter, exportOp.value(),
mixedTileSizes))) {
exportOp.value()->emitOpError("failed to lower workgroup count region");
return DiagnosedSilenceableFailure(
reportUnknownTransformError(exportOp.value()));
return mlir::emitDefiniteFailure(exportOp.value(),
"failed to lower workgroup count region");
}

ArrayRef<Operation *> targets = state.getPayloadOps(getTarget());
@@ -607,11 +614,16 @@ transform_dialect::TileToForeachThreadAndWorkgroupCountRegion::apply(
targets, getMixedNumThreads(), getMixedTileSizes(), getThreadDimMapping(),
tileOps, tiledOps);

if (!diag.succeeded()) return diag;
if (!diag.succeeded()) {
transformResults.set(getForeachThreadOp().cast<OpResult>(),
SmallVector<mlir::Operation *>{});
transformResults.set(getTiledOp().cast<OpResult>(),
SmallVector<mlir::Operation *>{});
return diag;
}

transformResults.set(getForeachThreadOp().cast<OpResult>(), tileOps);
transformResults.set(getTiledOp().cast<OpResult>(), tiledOps);

return DiagnosedSilenceableFailure(success());
}
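The two transformResults.set(..., {}) calls added on the failure path appear intended to keep the op's declared result handles bound, with empty payload lists, even when tiling fails silenceably, so that a suppressing sequence can continue without reading unset handles.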

@@ -238,12 +238,11 @@ transform_dialect::VectorToWarpExecuteOnLane0Op::applyToOne(
scf::IfOp target, SmallVectorImpl<Operation *> &results,
transform::TransformState &state) {
if (!isa<HAL::ExecutableOp, HAL::ExecutableVariantOp>(state.getTopLevel())) {
state.getTopLevel()->emitOpError(
"requires HAL::ExecutableOp or HAL::ExecutableVariantOp toplevel so "
"that IR is properly isolated. This is required so we can safely "
"inspect the HAL::ExecutableExportOp under multi-threaded pass "
"assumptions.");
return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
return emitDefaultSilenceableFailure(state.getTopLevel())
<< "requires HAL::ExecutableOp or HAL::ExecutableVariantOp toplevel "
"so that IR is properly isolated. This is required so we can "
"safely inspect the HAL::ExecutableExportOp under multi-threaded "
"pass assumptions.";
}

auto halExecutableVariantOp =
@@ -569,7 +569,7 @@ forgetUnnecessaryHandles(transform::TransformState &state,
continue;

for (Operation *payload : state.getPayloadOps(operand)) {
if (seen.contains(payload))
if (!payload || seen.contains(payload))
continue;
SmallVector<Value> allHandles;
(void)state.getHandlesForPayloadOp(payload, allHandles);
@@ -592,7 +592,7 @@ forgetUnnecessaryHandles(transform::TransformState &state,
if (!result.getUses().empty())
continue;
for (Operation *payload : state.getPayloadOps(result)) {
if (seen.contains(payload))
if (!payload || seen.contains(payload))
continue;
listener->removeMappings(payload);
seen.insert(payload);
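The added !payload guards in both loops presumably tolerate handles that map to null payload ops, which the silenceable-failure paths above now record via results.assign(1, nullptr).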
72 changes: 37 additions & 35 deletions tests/transform_dialect/cuda/BUILD
@@ -7,7 +7,7 @@
# Tests for end-to-end IREE support of entire models or their close derivatives.

load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
#load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")

package(
features = ["layering_check"],
@@ -23,37 +23,39 @@ endif()
inline = True,
)

# TODO: re-enable the tests
# iree_lit_test_suite(
# name = "lit",
# srcs = [
# "reduction.mlir",
# "softmax.mlir",
# ],
# cfg = "//tests:lit.cfg.py",
# # transform dialect spec files are MLIR files that specify a transformation,
# # they need to be included as data.
# data = [
# "reduction_codegen_spec.mlir",
# "softmax_codegen_spec.mlir",
# # FIXME: This cannot be retired yet as there is some writeonly vs readwrite
# # issue and we even end up emitting out of bounds accesses.
# "softmax_dispatch_spec.mlir",
# "softmax_fused_codegen_spec.mlir",
# ],
# tags = [
# # CUDA cuInit fails with sanitizer on.
# "noasan",
# "nomsan",
# "notsan",
# "noubsan",
# "requires-gpu-nvidia",
# "driver=cuda",
# ],
# tools = [
# "//tools:iree-compile",
# "//tools:iree-opt",
# "//tools:iree-run-module",
# "@llvm-project//llvm:FileCheck",
# ],
# )
iree_lit_test_suite(
name = "lit",
srcs = [
"reduction.mlir",
"softmax.mlir",
],
cfg = "//tests:lit.cfg.py",
# transform dialect spec files are MLIR files that specify a transformation;
# they need to be included as data.
data = [
"reduction_codegen_spec.mlir",
"softmax_codegen_spec.mlir",
#
# FIXME: Fused codegen must be used with the custom dispatch region formation
# because IREE pulls in tensor.empty by default.
# This results in threadprivate allocations and prevents vector distribution.
#
"softmax_dispatch_spec.mlir",
"softmax_fused_codegen_spec.mlir",
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-nvidia",
"driver=cuda",
],
tools = [
"//tools:iree-compile",
"//tools:iree-opt",
"//tools:iree-run-module",
"@llvm-project//llvm:FileCheck",
],
)
25 changes: 25 additions & 0 deletions tests/transform_dialect/cuda/CMakeLists.txt
@@ -14,4 +14,29 @@ if(NOT IREE_HAL_DRIVER_CUDA OR NOT IREE_TARGET_BACKEND_CUDA)
return()
endif()

iree_lit_test_suite(
NAME
lit
SRCS
"reduction.mlir"
"softmax.mlir"
TOOLS
FileCheck
iree-compile
iree-opt
iree-run-module
DATA
reduction_codegen_spec.mlir
softmax_codegen_spec.mlir
softmax_dispatch_spec.mlir
softmax_fused_codegen_spec.mlir
LABELS
"noasan"
"nomsan"
"notsan"
"noubsan"
"requires-gpu-nvidia"
"driver=cuda"
)

### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
4 changes: 2 additions & 2 deletions tests/transform_dialect/cuda/reduction_codegen_spec.mlir
@@ -1,6 +1,6 @@
// RUN: iree-opt %s

transform.structured.canonicalized_sequence failures(propagate) {
transform.structured.canonicalized_sequence failures(suppress) {
^bb1(%variant_op: !pdl.operation):
%fill = transform.structured.match ops{["linalg.fill"]} in %variant_op

@@ -15,7 +15,7 @@ transform.structured.canonicalized_sequence failures(propagate) {
// The mapping to block ids can only happen after bufferization atm.
%foreach_thread_grid, %grid_combiner_op =
transform.iree.tile_to_foreach_thread_and_workgroup_count_region %combiner_op tile_sizes [1]
%not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op
%not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op : !pdl.operation
transform.structured.fuse_into_containing_op %not_combiner into %foreach_thread_grid

// Second level of tiling + fusion parallelizes to threads.
16 changes: 9 additions & 7 deletions tests/transform_dialect/cuda/softmax.mlir
@@ -13,25 +13,27 @@
// RUN: iree-run-module --entry_function=max_sub_exp --device=cuda | \
// RUN: FileCheck %s

///
/// FIXME: Fused codegen must be used with the custom dispatch region formation
/// because IREE pulls in tensor.empty by default.
/// This results in threadprivate allocations and prevents vector distribution.
///
// RUN: iree-opt %s --iree-hal-target-backends=cuda \
// RUN: --iree-abi-transformation-pipeline \
// RUN: --iree-flow-transformation-pipeline \
///
/// FIXME: This cannot be retired yet as there is some writeonly vs readwrite
/// issue and we even end up emitting out of bounds accesses.
///
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
// RUN: --iree-stream-transformation-pipeline \
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='hal.executable(hal.executable.variant(iree-llvmgpu-lower-executable-target-pass))' \
// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_fused_codegen_spec.mlir | \
// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE

// RUN: iree-compile %s --iree-hal-target-backends=cuda \
///
/// FIXME: This cannot be retired yet as there is some writeonly vs readwrite
/// issue and we even end up emitting out of bounds accesses.
/// FIXME: Fused codegen must be used with the custom dispatch region formation
/// because IREE pulls in tensor.empty by default.
/// This results in threadprivate allocations and prevents vector distribution.
///
// RUN: iree-compile %s --iree-hal-target-backends=cuda \
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_fused_codegen_spec.mlir | \
// RUN: iree-run-module --entry_function=max_sub_exp --device=cuda | \
2 changes: 1 addition & 1 deletion tests/transform_dialect/cuda/softmax_codegen_spec.mlir
@@ -10,7 +10,7 @@ transform.structured.canonicalized_sequence failures(propagate) {
%fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
%red = transform.structured.match interface{LinalgOp}
attributes{iterator_types = ["parallel", "parallel", "reduction"]} in %variant_op
%not_root = merge_handles %fill, %red
%not_root = merge_handles %fill, %red : !pdl.operation
%foreach_thread, %tiled_generic =
transform.iree.tile_to_foreach_thread_and_workgroup_count_region %root tile_sizes [1, 4]
transform.structured.fuse_into_containing_op %not_root into %foreach_thread