halide · abadams · Feb 7, 2024 · Feb 5, 2024 · Feb 5, 2024 · Feb 5, 2024
diff --git a/src/CanonicalizeGPUVars.cpp b/src/CanonicalizeGPUVars.cpp
@@ -15,16 +15,16 @@ using std::string;
 using std::vector;
 
 namespace {
-string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"};
-string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"};
+string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"};
+string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"};
 
 string get_thread_name(int index) {
-    internal_assert(index >= 0 && index < 4);
+    internal_assert(index >= 0 && index < 3);
     return thread_names[index];
 }
 
 string get_block_name(int index) {
-    internal_assert(index >= 0 && index < 4);
+    internal_assert(index >= 0 && index < 3);
     return block_names[index];
 }
 
@@ -111,10 +111,6 @@ class CanonicalizeGPUVars : public IRMutator {
 
             CountGPUBlocksThreads counter;
             op->body.accept(&counter);
-            internal_assert(counter.nblocks <= 4)
-                << op->name << " can only have maximum of 4 block dimensions\n";
-            internal_assert(counter.nthreads <= 4)
-                << op->name << " can only have maximum of 4 thread dimensions\n";
 
             if (op->for_type == ForType::GPUBlock) {
                 name += "." + get_block_name(counter.nblocks);
@@ -123,7 +119,6 @@ class CanonicalizeGPUVars : public IRMutator {
                 name += "." + get_thread_name(counter.nthreads);
                 debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n";
             } else if (op->for_type == ForType::GPULane) {
-                user_assert(counter.nlanes == 0) << "Cannot nest multiple loops over gpu lanes: " << name << "\n";
                 name += "." + get_thread_name(0);
             }
 
@@ -190,9 +185,208 @@ class CanonicalizeGPUVars : public IRMutator {
     }
 };
 
+// Describe the chain of scopes that encloses `target`, starting the search at
+// `root`. The result begins with "The loop nest is:\n" and is followed by one
+// line per enclosing For / Realize / HoistedStorage / producer node (outermost
+// first), each indented one space further than the line before it. The final
+// line is the target loop itself.
+std::string loop_nest_summary_to_node(const IRNode *root, const IRNode *target) {
+    class Summary : public IRVisitor {
+    public:
+        // One entry per scope on the path from the root to the current
+        // position. Once the target has been reached the entries are frozen.
+        std::vector<std::ostringstream> stack;
+        Summary(const IRNode *target)
+            : target(target) {
+        }
+
+    protected:
+        const IRNode *target;
+        bool done = false;
+
+        using IRVisitor::visit;
+
+        // Open a new entry and hand back its stream so the caller can
+        // describe the scope being entered.
+        std::ostringstream &enter_scope() {
+            stack.emplace_back();
+            return stack.back();
+        }
+
+        // Discard the innermost entry, unless the target was found somewhere
+        // beneath it, in which case it is part of the answer and must stay.
+        void leave_scope() {
+            if (!done) {
+                stack.pop_back();
+            }
+        }
+
+        void visit(const For *op) override {
+            if (done) {
+                return;
+            }
+            enter_scope() << op->for_type << " " << op->name;
+            if (op == target) {
+                // Found it. Keep the entry and stop descending.
+                done = true;
+                return;
+            }
+            IRVisitor::visit(op);
+            leave_scope();
+        }
+
+        void visit(const Realize *op) override {
+            if (done) {
+                return;
+            }
+            enter_scope() << "store_at for " << op->name;
+            IRVisitor::visit(op);
+            leave_scope();
+        }
+
+        void visit(const HoistedStorage *op) override {
+            if (done) {
+                return;
+            }
+            enter_scope() << "hoisted storage for " << op->name;
+            IRVisitor::visit(op);
+            leave_scope();
+        }
+
+        void visit(const ProducerConsumer *op) override {
+            if (done) {
+                return;
+            }
+            if (!op->is_producer) {
+                // Consumer nodes get no entry of their own; just look inside.
+                IRVisitor::visit(op);
+                return;
+            }
+            enter_scope() << "compute_at for " << op->name;
+            IRVisitor::visit(op);
+            leave_scope();
+        }
+    };
+
+    Summary summary(target);
+    root->accept(&summary);
+
+    std::ostringstream result;
+    result << "The loop nest is:\n";
+    std::string indent;
+    for (const auto &entry : summary.stack) {
+        result << indent << entry.str() << ":\n";
+        indent.push_back(' ');
+    }
+    return result.str();
+}
+
+// Check the user's GPU schedule is valid. Any violation is reported through
+// user_assert, so no return value is required.
+//
+// The rules enforced are:
+//  - GPU block loops: nothing other than another GPU block loop may sit
+//    between two nested GPU block loops, at most 3 may be nested, and none may
+//    be inside a GPU thread loop.
+//  - GPU thread loops: must be inside a GPU block loop, at most 3 may be
+//    nested, and none may be inside a GPU lane loop.
+//  - GPU lane loops: may not be nested in another GPU lane loop, nor inside 3
+//    GPU thread loops.
+class ValidateGPUSchedule : public IRVisitor {
+
+    using IRVisitor::visit;
+
+    // The first For / Realize / ProducerConsumer / HoistedStorage node visited.
+    // Error messages print the loop nest starting from here.
+    const IRNode *root = nullptr;
+
+    // Number of loops of each GPU kind enclosing the node being visited.
+    int in_blocks = 0;
+    int in_threads = 0;
+    int in_lanes = 0;
+
+    // Names of the innermost enclosing loops, used in error messages.
+    std::string innermost_blocks_loop, innermost_threads_loop;
+
+    // Non-empty exactly while the traversal is beneath some node, itself
+    // inside a GPU block loop, that makes a further nested GPU block loop
+    // illegal. It holds a description of the outermost such node.
+    std::ostringstream blocks_not_ok_reason;
+
+    // Erase the recorded reason so that tellp() reports 0 again. Note that
+    // std::ostringstream::clear() alone only resets the stream's error flags
+    // and leaves the buffered text in place, so the contents must be replaced
+    // explicitly with str().
+    void clear_blocks_not_ok_reason() {
+        blocks_not_ok_reason.str(std::string());
+        blocks_not_ok_reason.clear();
+    }
+
+    void visit(const For *op) override {
+        if (!root) {
+            root = op;
+        }
+        // Whichever node records the reason is responsible for erasing it
+        // again once its subtree has been visited. Otherwise the reason would
+        // leak into sibling loops and into later, unrelated loop nests.
+        const bool recorded_reason =
+            in_blocks && op->for_type != ForType::GPUBlock && blocks_not_ok_reason.tellp() == 0;
+        if (recorded_reason) {
+            blocks_not_ok_reason << op->for_type << " loop over " << op->name;
+        }
+        if (op->for_type == ForType::GPUBlock) {
+            user_assert(blocks_not_ok_reason.tellp() == 0)
+                << blocks_not_ok_reason.str() << " is inside GPU block loop over "
+                << innermost_blocks_loop << " but outside GPU block loop over " << op->name
+                << ". Funcs cannot be scheduled in between GPU block loops. "
+                << loop_nest_summary_to_node(root, op);
+            user_assert(in_blocks < 3)
+                << "GPU block loop over " << op->name << " is inside three other GPU block loops. "
+                << "The maximum number of nested GPU block loops is 3. "
+                << loop_nest_summary_to_node(root, op);
+            user_assert(in_threads == 0)
+                << "GPU block loop over " << op->name << " is inside GPU thread loop over "
+                << innermost_threads_loop << ". "
+                << loop_nest_summary_to_node(root, op);
+            in_blocks++;
+            ScopedValue<std::string> s(innermost_blocks_loop, op->name);
+            IRVisitor::visit(op);
+            in_blocks--;
+        } else if (op->for_type == ForType::GPUThread) {
+            user_assert(in_lanes == 0)
+                << "GPU thread loop over " << op->name << " is inside a loop over GPU lanes. "
+                << "GPU thread loops must be outside any GPU lane loop. "
+                << loop_nest_summary_to_node(root, op);
+            user_assert(in_threads < 3)
+                << "GPU thread loop over " << op->name << " is inside three other GPU thread loops. "
+                << "The maximum number of nested GPU thread loops is 3. "
+                << loop_nest_summary_to_node(root, op);
+            user_assert(in_blocks)
+                << "GPU thread loop over " << op->name << " must be inside a GPU block loop. "
+                << loop_nest_summary_to_node(root, op);
+            in_threads++;
+            ScopedValue<std::string> s(innermost_threads_loop, op->name);
+            IRVisitor::visit(op);
+            in_threads--;
+        } else if (op->for_type == ForType::GPULane) {
+            user_assert(in_threads < 3)
+                << "GPU lane loop over " << op->name << " is inside three other GPU thread or lane loops. "
+                << "The maximum number of nested GPU thread or lane loops is 3. "
+                << loop_nest_summary_to_node(root, op);
+            user_assert(in_lanes == 0)
+                << "GPU lane loop over " << op->name << " is inside another GPU lane loop. GPU lane loops "
+                << "may not be nested. "
+                << loop_nest_summary_to_node(root, op);
+            in_lanes++;
+            // Lane loops are reported through the same "innermost thread
+            // loop" name as thread loops.
+            ScopedValue<std::string> s(innermost_threads_loop, op->name);
+            IRVisitor::visit(op);
+            in_lanes--;
+        } else {
+            IRVisitor::visit(op);
+        }
+        if (recorded_reason) {
+            clear_blocks_not_ok_reason();
+        }
+    }
+
+    void visit(const Realize *op) override {
+        if (!root) {
+            root = op;
+        }
+        if (in_blocks && blocks_not_ok_reason.tellp() == 0) {
+            // A storage scope inside a block loop: no GPU block loop may
+            // appear anywhere beneath it.
+            blocks_not_ok_reason << "store_at location for " << op->name;
+            IRVisitor::visit(op);
+            clear_blocks_not_ok_reason();
+        } else {
+            IRVisitor::visit(op);
+        }
+    }
+
+    void visit(const ProducerConsumer *op) override {
+        if (!root) {
+            root = op;
+        }
+        // Only producer nodes are recorded; consumer nodes are visited
+        // without recording a reason.
+        if (op->is_producer && in_blocks && blocks_not_ok_reason.tellp() == 0) {
+            blocks_not_ok_reason << "compute_at location for " << op->name;
+            IRVisitor::visit(op);
+            clear_blocks_not_ok_reason();
+        } else {
+            IRVisitor::visit(op);
+        }
+    }
+
+    void visit(const HoistedStorage *op) override {
+        if (!root) {
+            root = op;
+        }
+        if (in_blocks && blocks_not_ok_reason.tellp() == 0) {
+            blocks_not_ok_reason << "hoist_storage location for " << op->name;
+            IRVisitor::visit(op);
+            clear_blocks_not_ok_reason();
+        } else {
+            IRVisitor::visit(op);
+        }
+    }
+};
+
 }  // anonymous namespace
 
 Stmt canonicalize_gpu_vars(Stmt s) {
+    ValidateGPUSchedule validator;
+    s.accept(&validator);
     CanonicalizeGPUVars canonicalizer;
     s = canonicalizer.mutate(s);
     return s;

diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp
@@ -1515,44 +1515,6 @@ class ZeroGPULoopMins : public IRMutator {
     ZeroGPULoopMins() = default;
 };
 
-class ValidateGPULoopNesting : public IRVisitor {
-    int gpu_block_depth = 0, gpu_thread_depth = 0;
-    string innermost_block_var, innermost_thread_var;
-
-    using IRVisitor::visit;
-
-    void visit(const For *op) override {
-        ScopedValue<string> old_innermost_block_var(innermost_block_var);
-        ScopedValue<string> old_innermost_thread_var(innermost_thread_var);
-        ScopedValue<int> old_gpu_block_depth(gpu_block_depth);
-        ScopedValue<int> old_gpu_thread_depth(gpu_thread_depth);
-
-        for (int i = 1; i <= 4; i++) {
-            if (ends_with(op->name, block_names[4 - i])) {
-                user_assert(i > gpu_block_depth)
-                    << "Invalid schedule: Loop over " << op->name
-                    << " cannot be inside of loop over " << innermost_block_var << "\n";
-                user_assert(gpu_thread_depth == 0)
-                    << "Invalid schedule: Loop over " << op->name
-                    << " cannot be inside of loop over " << innermost_thread_var << "\n";
-                innermost_block_var = op->name;
-                gpu_block_depth = i;
-            }
-            if (ends_with(op->name, thread_names[4 - i])) {
-                user_assert(i > gpu_thread_depth)
-                    << "Invalid schedule: Loop over " << op->name
-                    << " cannot be inside of loop over " << innermost_thread_var << "\n";
-                user_assert(gpu_block_depth > 0)
-                    << "Invalid schedule: Loop over " << op->name
-                    << " must be inside a loop over gpu blocks\n";
-                innermost_thread_var = op->name;
-                gpu_thread_depth = i;
-            }
-        }
-        IRVisitor::visit(op);
-    }
-};
-
 }  // namespace
 
 // Also used by InjectImageIntrinsics
@@ -1632,8 +1594,6 @@ class NormalizeIfStatements : public IRMutator {
 }  // namespace
 
 Stmt fuse_gpu_thread_loops(Stmt s) {
-    ValidateGPULoopNesting validate;
-    s.accept(&validate);
     // NormalizeIfStatements pushes the predicates between GPU blocks
     // into the innermost GPU block. FuseGPUThreadLoops would then
     // merge the predicate into the merged GPU thread.

diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp
@@ -2269,6 +2269,7 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_
 
     std::ostringstream err;
 
+    /*
     // If you're compute_at() inside a gpu blocks loop, you can't have a gpu blocks loop yourself
     const auto has_gpu_blocks = [&]() {
         for (const Dim &d : f.definition().schedule().dims()) {
@@ -2278,40 +2279,12 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_
         }
         return false;
     };
+    */
 
     const auto all_ok = [&]() {
         return store_idx >= 0 && compute_idx >= 0 && hoist_storage_idx >= 0;
     };
 
-    if (all_ok() && has_gpu_blocks()) {
-        for (int i = 0; i <= compute_idx; i++) {
-            if (sites[i].is_gpu_block) {
-                string site_fname = sites[i].loop_level.func();
-                user_error << "Functions that are compute_at() a gpu_block() loop cannot have their own gpu_block() loops, "
-                           << "but Func \"" << f.name() << "\" is compute_at() \"" << site_fname << "\"\n";
-            }
-        }
-    }
-
-    // If you're compute_at() a var marked as a gpu block var, it must be the innermost one
-    if (all_ok() && sites[compute_idx].is_gpu_block) {
-        string compute_at_fname = sites[compute_idx].loop_level.func();
-        int possibly_invalid_idx = compute_idx;
-        for (int i = compute_idx + 1; i < (int)sites.size(); i++) {
-            if (!sites[i].is_gpu_block) {
-                continue;
-            }
-            string site_fname = sites[i].loop_level.func();
-            if (site_fname == compute_at_fname) {
-                err << "Functions that are compute_at() a gpu_block() loop must specify the innermost gpu_block() loop for that Func.\n";
-                sites.erase(sites.begin() + possibly_invalid_idx);
-                // This one will also be invalid if we find a subsequent loop from the same func
-                possibly_invalid_idx = i;
-                store_idx = compute_idx = hoist_storage_idx = -1;
-            }
-        }
-    }
-
     // Check there isn't a parallel loop between the compute_at and the store_at
     if (all_ok()) {
         for (int i = store_idx + 1; i <= compute_idx; i++) {

diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
@@ -131,8 +131,6 @@ tests(GROUPS correctness
       gpu_data_flows.cpp
       gpu_different_blocks_threads_dimensions.cpp
       gpu_dynamic_shared.cpp
-      gpu_error_1.cpp
-      gpu_error_2.cpp
       gpu_free_sync.cpp
       gpu_give_input_buffers_device_allocations.cpp
       gpu_jit_explicit_copy_to_device.cpp
@@ -186,6 +184,7 @@ tests(GROUPS correctness
       interval.cpp
       intrinsics.cpp
       introspection.cpp
+      invalid_gpu_loop_nests.cpp
       inverse.cpp
       isnan.cpp
       issue_3926.cpp