Skip to content

Commit

Permalink
Better validation of gpu schedules (#8068)
Browse files Browse the repository at this point in the history
* Update makefile to use test/common/terminate_handler.cpp

This means we actually print error messages when using exceptions and
building with the makefile.

* Better validation of GPU schedules

GPU loop constraints were checked in two different places. Checking them
in ScheduleFunctions was incorrect because it didn't consider update
definitions and specializations. Checking them in FuseGPUThreadLoops was
too late, because the Var names have gone (they've been renamed to
things like __thread_id_x). Furthermore, some problems were internal
errors or runtime errors when they should have been user errors. We
allowed 4d thread and block dimensions, but then hit an internal error.

This PR centralizes checking of GPU loop structure in
CanonicalizeGPUVars and adds more helpful error messages that print the
problematic loop structure. E.g:

```
Error: GPU thread loop over f$8.s0.v0 is inside three other GPU thread
loops. The maximum number of nested GPU thread loops is 3. The loop nest
is:
compute_at for g$8:
 for g$8.s0.v7:
  for g$8.s0.v6:
   for g$8.s0.v5:
    for g$8.s0.v4:
     gpu_block g$8.s0.v3:
      gpu_block g$8.s0.v2:
       gpu_thread g$8.s0.v1:
        gpu_thread g$8.s0.v0:
         store_at for f$8:
          compute_at for f$8:
           gpu_thread f$8.s0.v1:
            gpu_thread f$8.s0.v0:
```

Fixes the bug found in #7946

* Delete dead code

* Actually clear the ostringstream
  • Loading branch information
abadams authored Feb 7, 2024
1 parent 37153a9 commit 39e5c08
Show file tree
Hide file tree
Showing 7 changed files with 317 additions and 183 deletions.
222 changes: 213 additions & 9 deletions src/CanonicalizeGPUVars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ using std::string;
using std::vector;

namespace {
string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"};
string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"};
string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"};
string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"};

string get_thread_name(int index) {
internal_assert(index >= 0 && index < 4);
internal_assert(index >= 0 && index < 3);
return thread_names[index];
}

string get_block_name(int index) {
internal_assert(index >= 0 && index < 4);
internal_assert(index >= 0 && index < 3);
return block_names[index];
}

Expand Down Expand Up @@ -111,10 +111,6 @@ class CanonicalizeGPUVars : public IRMutator {

CountGPUBlocksThreads counter;
op->body.accept(&counter);
internal_assert(counter.nblocks <= 4)
<< op->name << " can only have maximum of 4 block dimensions\n";
internal_assert(counter.nthreads <= 4)
<< op->name << " can only have maximum of 4 thread dimensions\n";

if (op->for_type == ForType::GPUBlock) {
name += "." + get_block_name(counter.nblocks);
Expand All @@ -123,7 +119,6 @@ class CanonicalizeGPUVars : public IRMutator {
name += "." + get_thread_name(counter.nthreads);
debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n";
} else if (op->for_type == ForType::GPULane) {
user_assert(counter.nlanes == 0) << "Cannot nest multiple loops over gpu lanes: " << name << "\n";
name += "." + get_thread_name(0);
}

Expand Down Expand Up @@ -190,9 +185,218 @@ class CanonicalizeGPUVars : public IRMutator {
}
};

// Render a human-readable summary of the loop nest leading from `root` down
// to `target`, for use in GPU-schedule error messages. Each enclosing For,
// Realize (store_at), HoistedStorage, or producer (compute_at) node on the
// path contributes one line, indented one space per level of nesting; the
// summary stops at `target`.
std::string loop_nest_summary_to_node(const IRNode *root, const IRNode *target) {
    class Summary : public IRVisitor {
    public:
        // One label per scope on the path from the root down to the target.
        std::vector<std::ostringstream> stack;

        explicit Summary(const IRNode *target)
            : target(target) {
        }

    protected:
        const IRNode *target;
        // Set once the target has been reached; freezes the stack.
        bool done = false;

        using IRVisitor::visit;

        // Push a label for the scope being entered. Returns false if the
        // target has already been found, in which case nothing is pushed
        // and the caller should not recurse.
        bool enter(const std::string &label) {
            if (done) {
                return false;
            }
            stack.emplace_back();
            stack.back() << label;
            return true;
        }

        // Pop the label pushed by the matching enter() call, unless the
        // target was found somewhere inside this scope (then the label is
        // part of the reported path and must be kept).
        void leave() {
            if (!done) {
                stack.pop_back();
            }
        }

        void visit(const For *op) override {
            if (done) {
                return;
            }
            std::ostringstream label;
            label << op->for_type << " " << op->name;
            enter(label.str());
            if (op == target) {
                done = true;
            } else {
                IRVisitor::visit(op);
                leave();
            }
        }

        void visit(const Realize *op) override {
            if (enter("store_at for " + op->name)) {
                IRVisitor::visit(op);
                leave();
            }
        }

        void visit(const HoistedStorage *op) override {
            if (enter("hoisted storage for " + op->name)) {
                IRVisitor::visit(op);
                leave();
            }
        }

        void visit(const ProducerConsumer *op) override {
            // Only the producer side marks a compute_at scope; the consumer
            // side is not part of the loop-nest structure we report.
            if (op->is_producer) {
                if (enter("compute_at for " + op->name)) {
                    IRVisitor::visit(op);
                    leave();
                }
            } else {
                IRVisitor::visit(op);
            }
        }
    } summary{target};

    root->accept(&summary);

    std::ostringstream result;
    std::string prefix;
    result << "The loop nest is:\n";
    for (const auto &label : summary.stack) {
        result << prefix << label.str() << ":\n";
        prefix += " ";
    }
    return result.str();
}

// Check the user's GPU schedule is valid. Throws an error if it is not, so no
// return value required.
class ValidateGPUSchedule : public IRVisitor {

    using IRVisitor::visit;

    // The first IR node visited; used as the root when printing a loop-nest
    // summary in error messages.
    const IRNode *root = nullptr;

    // Current nesting depth of each kind of GPU loop at the point of visitation.
    int in_blocks = 0;
    int in_threads = 0;
    int in_lanes = 0;

    // Names of the innermost enclosing GPU block/thread loops, for error messages.
    std::string innermost_blocks_loop, innermost_threads_loop;
    // Describes the first non-GPU-block construct encountered while inside a
    // GPU block loop. Empty (tellp() == 0) means the block nest is clean so
    // far; if still set when another GPU block loop is reached, that loop is
    // illegally separated from its parent block loop and we error out.
    std::ostringstream blocks_not_ok_reason;

    void clear_blocks_not_ok_reason() {
        // ostringstream has no portable reset; swap with a fresh one to
        // actually clear both the buffer and the stream state.
        std::ostringstream empty;
        blocks_not_ok_reason.swap(empty);
    }

    void visit(const For *op) override {
        if (!root) {
            root = op;
        }
        // Only record the first offending construct between block loops, and
        // only un-record it when leaving the subtree of the node that set it.
        bool should_clear = false;
        if (in_blocks && op->for_type != ForType::GPUBlock && blocks_not_ok_reason.tellp() == 0) {
            blocks_not_ok_reason << op->for_type << " loop over " << op->name;
            should_clear = true;
        }
        if (op->for_type == ForType::GPUBlock) {
            // GPU block loops must nest directly inside each other (nothing
            // scheduled in between), at most 3 deep, and never inside a GPU
            // thread loop.
            user_assert(blocks_not_ok_reason.tellp() == 0)
                << blocks_not_ok_reason.str() << " is inside GPU block loop over "
                << innermost_blocks_loop << " but outside GPU block loop over " << op->name
                << ". Funcs cannot be scheduled in between GPU block loops. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_blocks < 3)
                << "GPU block loop over " << op->name << " is inside three other GPU block loops. "
                << "The maximum number of nested GPU block loops is 3. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_threads == 0)
                << "GPU block loop over " << op->name << " is inside GPU thread loop over "
                << innermost_threads_loop << ". "
                << loop_nest_summary_to_node(root, op);
            in_blocks++;
            ScopedValue<std::string> s(innermost_blocks_loop, op->name);
            IRVisitor::visit(op);
            in_blocks--;
        } else if (op->for_type == ForType::GPUThread) {
            // GPU thread loops: at most 3 deep, inside a GPU block loop, and
            // outside any GPU lane loop.
            user_assert(in_lanes == 0)
                << "GPU thread loop over " << op->name << " is inside a loop over GPU lanes. "
                << "GPU thread loops must be outside any GPU lane loop. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_threads < 3)
                << "GPU thread loop over " << op->name << " is inside three other GPU thread loops. "
                << "The maximum number of nested GPU thread loops is 3. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_blocks)
                << "GPU thread loop over " << op->name << " must be inside a GPU block loop. "
                << loop_nest_summary_to_node(root, op);
            in_threads++;
            ScopedValue<std::string> s(innermost_threads_loop, op->name);
            IRVisitor::visit(op);
            in_threads--;
        } else if (op->for_type == ForType::GPULane) {
            // GPU lane loops count against the thread-nesting limit and may
            // not themselves be nested.
            user_assert(in_threads < 3)
                << "GPU lane loop over " << op->name << " is inside three other GPU thread or lane loops. "
                << "The maximum number of nested GPU thread or lane loops is 3. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_lanes == 0)
                << "GPU lane loop over " << op->name << " is inside another GPU lane loop. GPU lane loops "
                << "may not be nested. "
                << loop_nest_summary_to_node(root, op);
            in_lanes++;
            // NOTE: a lane loop updates innermost_threads_loop (not a separate
            // innermost_lanes_loop), so thread-related messages can name it.
            ScopedValue<std::string> s(innermost_threads_loop, op->name);
            IRVisitor::visit(op);
            in_lanes--;
        } else {
            IRVisitor::visit(op);
        }
        if (should_clear) {
            clear_blocks_not_ok_reason();
        }
    }

    void visit(const Realize *op) override {
        if (!root) {
            root = op;
        }
        // A store_at between GPU block loops makes the block nest invalid.
        if (in_blocks && blocks_not_ok_reason.tellp() == 0) {
            blocks_not_ok_reason << "store_at location for " << op->name;
            IRVisitor::visit(op);
            clear_blocks_not_ok_reason();
        } else {
            IRVisitor::visit(op);
        }
    }

    void visit(const ProducerConsumer *op) override {
        if (!root) {
            root = op;
        }
        // Only the producer side marks a compute_at location; a compute_at
        // between GPU block loops makes the block nest invalid.
        if (op->is_producer && in_blocks && blocks_not_ok_reason.tellp() == 0) {
            blocks_not_ok_reason << "compute_at location for " << op->name;
            IRVisitor::visit(op);
            clear_blocks_not_ok_reason();
        } else {
            IRVisitor::visit(op);
        }
    }

    void visit(const HoistedStorage *op) override {
        if (!root) {
            root = op;
        }
        // A hoist_storage between GPU block loops makes the block nest invalid.
        if (in_blocks && blocks_not_ok_reason.tellp() == 0) {
            blocks_not_ok_reason << "hoist_storage location for " << op->name;
            IRVisitor::visit(op);
            clear_blocks_not_ok_reason();
        } else {
            IRVisitor::visit(op);
        }
    }
};

} // anonymous namespace

Stmt canonicalize_gpu_vars(Stmt s) {
ValidateGPUSchedule validator;
s.accept(&validator);
CanonicalizeGPUVars canonicalizer;
s = canonicalizer.mutate(s);
return s;
Expand Down
40 changes: 0 additions & 40 deletions src/FuseGPUThreadLoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1515,44 +1515,6 @@ class ZeroGPULoopMins : public IRMutator {
ZeroGPULoopMins() = default;
};

class ValidateGPULoopNesting : public IRVisitor {
int gpu_block_depth = 0, gpu_thread_depth = 0;
string innermost_block_var, innermost_thread_var;

using IRVisitor::visit;

void visit(const For *op) override {
ScopedValue<string> old_innermost_block_var(innermost_block_var);
ScopedValue<string> old_innermost_thread_var(innermost_thread_var);
ScopedValue<int> old_gpu_block_depth(gpu_block_depth);
ScopedValue<int> old_gpu_thread_depth(gpu_thread_depth);

for (int i = 1; i <= 4; i++) {
if (ends_with(op->name, block_names[4 - i])) {
user_assert(i > gpu_block_depth)
<< "Invalid schedule: Loop over " << op->name
<< " cannot be inside of loop over " << innermost_block_var << "\n";
user_assert(gpu_thread_depth == 0)
<< "Invalid schedule: Loop over " << op->name
<< " cannot be inside of loop over " << innermost_thread_var << "\n";
innermost_block_var = op->name;
gpu_block_depth = i;
}
if (ends_with(op->name, thread_names[4 - i])) {
user_assert(i > gpu_thread_depth)
<< "Invalid schedule: Loop over " << op->name
<< " cannot be inside of loop over " << innermost_thread_var << "\n";
user_assert(gpu_block_depth > 0)
<< "Invalid schedule: Loop over " << op->name
<< " must be inside a loop over gpu blocks\n";
innermost_thread_var = op->name;
gpu_thread_depth = i;
}
}
IRVisitor::visit(op);
}
};

} // namespace

// Also used by InjectImageIntrinsics
Expand Down Expand Up @@ -1632,8 +1594,6 @@ class NormalizeIfStatements : public IRMutator {
} // namespace

Stmt fuse_gpu_thread_loops(Stmt s) {
ValidateGPULoopNesting validate;
s.accept(&validate);
// NormalizeIfStatements pushes the predicates between GPU blocks
// into the innermost GPU block. FuseGPUThreadLoops would then
// merge the predicate into the merged GPU thread.
Expand Down
39 changes: 0 additions & 39 deletions src/ScheduleFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2269,49 +2269,10 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_

std::ostringstream err;

// If you're compute_at() inside a gpu blocks loop, you can't have a gpu blocks loop yourself
const auto has_gpu_blocks = [&]() {
for (const Dim &d : f.definition().schedule().dims()) {
if (d.for_type == ForType::GPUBlock) {
return true;
}
}
return false;
};

const auto all_ok = [&]() {
return store_idx >= 0 && compute_idx >= 0 && hoist_storage_idx >= 0;
};

if (all_ok() && has_gpu_blocks()) {
for (int i = 0; i <= compute_idx; i++) {
if (sites[i].is_gpu_block) {
string site_fname = sites[i].loop_level.func();
user_error << "Functions that are compute_at() a gpu_block() loop cannot have their own gpu_block() loops, "
<< "but Func \"" << f.name() << "\" is compute_at() \"" << site_fname << "\"\n";
}
}
}

// If you're compute_at() a var marked as a gpu block var, it must be the innermost one
if (all_ok() && sites[compute_idx].is_gpu_block) {
string compute_at_fname = sites[compute_idx].loop_level.func();
int possibly_invalid_idx = compute_idx;
for (int i = compute_idx + 1; i < (int)sites.size(); i++) {
if (!sites[i].is_gpu_block) {
continue;
}
string site_fname = sites[i].loop_level.func();
if (site_fname == compute_at_fname) {
err << "Functions that are compute_at() a gpu_block() loop must specify the innermost gpu_block() loop for that Func.\n";
sites.erase(sites.begin() + possibly_invalid_idx);
// This one will also be invalid if we find a subsequent loop from the same func
possibly_invalid_idx = i;
store_idx = compute_idx = hoist_storage_idx = -1;
}
}
}

// Check there isn't a parallel loop between the compute_at and the store_at
if (all_ok()) {
for (int i = store_idx + 1; i <= compute_idx; i++) {
Expand Down
3 changes: 1 addition & 2 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,6 @@ tests(GROUPS correctness
gpu_data_flows.cpp
gpu_different_blocks_threads_dimensions.cpp
gpu_dynamic_shared.cpp
gpu_error_1.cpp
gpu_error_2.cpp
gpu_free_sync.cpp
gpu_give_input_buffers_device_allocations.cpp
gpu_jit_explicit_copy_to_device.cpp
Expand Down Expand Up @@ -187,6 +185,7 @@ tests(GROUPS correctness
interval.cpp
intrinsics.cpp
introspection.cpp
invalid_gpu_loop_nests.cpp
inverse.cpp
isnan.cpp
issue_3926.cpp
Expand Down
Loading

0 comments on commit 39e5c08

Please sign in to comment.