Skip to content

Commit

Permalink
Better validation of gpu schedules (#8068)
Browse files Browse the repository at this point in the history
* Update makefile to use test/common/terminate_handler.cpp

This means we actually print error messages when using exceptions and
building with the makefile.

* Better validation of GPU schedules

GPU loop constraints were checked in two different places. Checking them
in ScheduleFunctions was incorrect because it didn't consider update
definitions and specializations. Checking them in FuseGPUThreadLoops was
too late, because the Var names have gone (they've been renamed to
things like __thread_id_x). Furthermore, some problems were internal
errors or runtime errors when they should have been user errors. We
allowed 4d thread and block dimensions, but then hit an internal error.

This PR centralizes checking of GPU loop structure in
CanonicalizeGPUVars and adds more helpful error messages that print the
problematic loop structure. E.g:

```
Error: GPU thread loop over f$8.s0.v0 is inside three other GPU thread
loops. The maximum number of nested GPU thread loops is 3. The loop nest
is:
compute_at for g$8:
 for g$8.s0.v7:
  for g$8.s0.v6:
   for g$8.s0.v5:
    for g$8.s0.v4:
     gpu_block g$8.s0.v3:
      gpu_block g$8.s0.v2:
       gpu_thread g$8.s0.v1:
        gpu_thread g$8.s0.v0:
         store_at for f$8:
          compute_at for f$8:
           gpu_thread f$8.s0.v1:
            gpu_thread f$8.s0.v0:
```

Fixes the bug found in #7946

* Delete dead code

* Actually clear the ostringstream
  • Loading branch information
abadams authored Feb 7, 2024
1 parent 37153a9 commit 39e5c08
Show file tree
Hide file tree
Showing 7 changed files with 317 additions and 183 deletions.
222 changes: 213 additions & 9 deletions src/CanonicalizeGPUVars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ using std::string;
using std::vector;

namespace {
string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"};
string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"};
string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"};
string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"};

string get_thread_name(int index) {
internal_assert(index >= 0 && index < 4);
internal_assert(index >= 0 && index < 3);
return thread_names[index];
}

string get_block_name(int index) {
internal_assert(index >= 0 && index < 4);
internal_assert(index >= 0 && index < 3);
return block_names[index];
}

Expand Down Expand Up @@ -111,10 +111,6 @@ class CanonicalizeGPUVars : public IRMutator {

CountGPUBlocksThreads counter;
op->body.accept(&counter);
internal_assert(counter.nblocks <= 4)
<< op->name << " can only have maximum of 4 block dimensions\n";
internal_assert(counter.nthreads <= 4)
<< op->name << " can only have maximum of 4 thread dimensions\n";

if (op->for_type == ForType::GPUBlock) {
name += "." + get_block_name(counter.nblocks);
Expand All @@ -123,7 +119,6 @@ class CanonicalizeGPUVars : public IRMutator {
name += "." + get_thread_name(counter.nthreads);
debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n";
} else if (op->for_type == ForType::GPULane) {
user_assert(counter.nlanes == 0) << "Cannot nest multiple loops over gpu lanes: " << name << "\n";
name += "." + get_thread_name(0);
}

Expand Down Expand Up @@ -190,9 +185,218 @@ class CanonicalizeGPUVars : public IRMutator {
}
};

// Render a human-readable summary of the loop nest leading from `root` down
// to `target`, for use in GPU-schedule error messages. Each enclosing For,
// Realize (store_at), HoistedStorage, or producer (compute_at) node on the
// path contributes one line, indented one space per level of nesting; the
// summary stops at `target`.
std::string loop_nest_summary_to_node(const IRNode *root, const IRNode *target) {
    class Summary : public IRVisitor {
    public:
        // One label per scope on the path from the root down to the target.
        std::vector<std::ostringstream> stack;

        explicit Summary(const IRNode *target)
            : target(target) {
        }

    protected:
        const IRNode *target;
        // Set once the target has been reached; freezes the stack.
        bool done = false;

        using IRVisitor::visit;

        // Push a label for the scope being entered. Returns false if the
        // target has already been found, in which case nothing is pushed
        // and the caller should not recurse.
        bool enter(const std::string &label) {
            if (done) {
                return false;
            }
            stack.emplace_back();
            stack.back() << label;
            return true;
        }

        // Pop the label pushed by the matching enter() call, unless the
        // target was found somewhere inside this scope (then the label is
        // part of the reported path and must be kept).
        void leave() {
            if (!done) {
                stack.pop_back();
            }
        }

        void visit(const For *op) override {
            if (done) {
                return;
            }
            std::ostringstream label;
            label << op->for_type << " " << op->name;
            enter(label.str());
            if (op == target) {
                done = true;
            } else {
                IRVisitor::visit(op);
                leave();
            }
        }

        void visit(const Realize *op) override {
            if (enter("store_at for " + op->name)) {
                IRVisitor::visit(op);
                leave();
            }
        }

        void visit(const HoistedStorage *op) override {
            if (enter("hoisted storage for " + op->name)) {
                IRVisitor::visit(op);
                leave();
            }
        }

        void visit(const ProducerConsumer *op) override {
            // Only the producer side marks a compute_at scope; the consumer
            // side is not part of the loop-nest structure we report.
            if (op->is_producer) {
                if (enter("compute_at for " + op->name)) {
                    IRVisitor::visit(op);
                    leave();
                }
            } else {
                IRVisitor::visit(op);
            }
        }
    } summary{target};

    root->accept(&summary);

    std::ostringstream result;
    std::string prefix;
    result << "The loop nest is:\n";
    for (const auto &label : summary.stack) {
        result << prefix << label.str() << ":\n";
        prefix += " ";
    }
    return result.str();
}

// Check the user's GPU schedule is valid. Throws an error if it is not, so no
// return value required.
class ValidateGPUSchedule : public IRVisitor {

    using IRVisitor::visit;

    // The first IR node visited; used as the root when printing a loop-nest
    // summary in error messages.
    const IRNode *root = nullptr;

    // Current nesting depth of each kind of GPU loop at the point of visitation.
    int in_blocks = 0;
    int in_threads = 0;
    int in_lanes = 0;

    // Names of the innermost enclosing GPU block/thread loops, for error messages.
    std::string innermost_blocks_loop, innermost_threads_loop;
    // Describes the first non-GPU-block construct encountered while inside a
    // GPU block loop. Empty (tellp() == 0) means the block nest is clean so
    // far; if still set when another GPU block loop is reached, that loop is
    // illegally separated from its parent block loop and we error out.
    std::ostringstream blocks_not_ok_reason;

    void clear_blocks_not_ok_reason() {
        // ostringstream has no portable reset; swap with a fresh one to
        // actually clear both the buffer and the stream state.
        std::ostringstream empty;
        blocks_not_ok_reason.swap(empty);
    }

    void visit(const For *op) override {
        if (!root) {
            root = op;
        }
        // Only record the first offending construct between block loops, and
        // only un-record it when leaving the subtree of the node that set it.
        bool should_clear = false;
        if (in_blocks && op->for_type != ForType::GPUBlock && blocks_not_ok_reason.tellp() == 0) {
            blocks_not_ok_reason << op->for_type << " loop over " << op->name;
            should_clear = true;
        }
        if (op->for_type == ForType::GPUBlock) {
            // GPU block loops must nest directly inside each other (nothing
            // scheduled in between), at most 3 deep, and never inside a GPU
            // thread loop.
            user_assert(blocks_not_ok_reason.tellp() == 0)
                << blocks_not_ok_reason.str() << " is inside GPU block loop over "
                << innermost_blocks_loop << " but outside GPU block loop over " << op->name
                << ". Funcs cannot be scheduled in between GPU block loops. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_blocks < 3)
                << "GPU block loop over " << op->name << " is inside three other GPU block loops. "
                << "The maximum number of nested GPU block loops is 3. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_threads == 0)
                << "GPU block loop over " << op->name << " is inside GPU thread loop over "
                << innermost_threads_loop << ". "
                << loop_nest_summary_to_node(root, op);
            in_blocks++;
            ScopedValue<std::string> s(innermost_blocks_loop, op->name);
            IRVisitor::visit(op);
            in_blocks--;
        } else if (op->for_type == ForType::GPUThread) {
            // GPU thread loops: at most 3 deep, inside a GPU block loop, and
            // outside any GPU lane loop.
            user_assert(in_lanes == 0)
                << "GPU thread loop over " << op->name << " is inside a loop over GPU lanes. "
                << "GPU thread loops must be outside any GPU lane loop. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_threads < 3)
                << "GPU thread loop over " << op->name << " is inside three other GPU thread loops. "
                << "The maximum number of nested GPU thread loops is 3. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_blocks)
                << "GPU thread loop over " << op->name << " must be inside a GPU block loop. "
                << loop_nest_summary_to_node(root, op);
            in_threads++;
            ScopedValue<std::string> s(innermost_threads_loop, op->name);
            IRVisitor::visit(op);
            in_threads--;
        } else if (op->for_type == ForType::GPULane) {
            // GPU lane loops count against the thread-nesting limit and may
            // not themselves be nested.
            user_assert(in_threads < 3)
                << "GPU lane loop over " << op->name << " is inside three other GPU thread or lane loops. "
                << "The maximum number of nested GPU thread or lane loops is 3. "
                << loop_nest_summary_to_node(root, op);
            user_assert(in_lanes == 0)
                << "GPU lane loop over " << op->name << " is inside another GPU lane loop. GPU lane loops "
                << "may not be nested. "
                << loop_nest_summary_to_node(root, op);
            in_lanes++;
            // NOTE: a lane loop updates innermost_threads_loop (not a separate
            // innermost_lanes_loop), so thread-related messages can name it.
            ScopedValue<std::string> s(innermost_threads_loop, op->name);
            IRVisitor::visit(op);
            in_lanes--;
        } else {
            IRVisitor::visit(op);
        }
        if (should_clear) {
            clear_blocks_not_ok_reason();
        }
    }

    void visit(const Realize *op) override {
        if (!root) {
            root = op;
        }
        // A store_at between GPU block loops makes the block nest invalid.
        if (in_blocks && blocks_not_ok_reason.tellp() == 0) {
            blocks_not_ok_reason << "store_at location for " << op->name;
            IRVisitor::visit(op);
            clear_blocks_not_ok_reason();
        } else {
            IRVisitor::visit(op);
        }
    }

    void visit(const ProducerConsumer *op) override {
        if (!root) {
            root = op;
        }
        // Only the producer side marks a compute_at location; a compute_at
        // between GPU block loops makes the block nest invalid.
        if (op->is_producer && in_blocks && blocks_not_ok_reason.tellp() == 0) {
            blocks_not_ok_reason << "compute_at location for " << op->name;
            IRVisitor::visit(op);
            clear_blocks_not_ok_reason();
        } else {
            IRVisitor::visit(op);
        }
    }

    void visit(const HoistedStorage *op) override {
        if (!root) {
            root = op;
        }
        // A hoist_storage between GPU block loops makes the block nest invalid.
        if (in_blocks && blocks_not_ok_reason.tellp() == 0) {
            blocks_not_ok_reason << "hoist_storage location for " << op->name;
            IRVisitor::visit(op);
            clear_blocks_not_ok_reason();
        } else {
            IRVisitor::visit(op);
        }
    }
};

} // anonymous namespace

Stmt canonicalize_gpu_vars(Stmt s) {
ValidateGPUSchedule validator;
s.accept(&validator);
CanonicalizeGPUVars canonicalizer;
s = canonicalizer.mutate(s);
return s;
Expand Down
40 changes: 0 additions & 40 deletions src/FuseGPUThreadLoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1515,44 +1515,6 @@ class ZeroGPULoopMins : public IRMutator {
ZeroGPULoopMins() = default;
};

class ValidateGPULoopNesting : public IRVisitor {
int gpu_block_depth = 0, gpu_thread_depth = 0;
string innermost_block_var, innermost_thread_var;

using IRVisitor::visit;

void visit(const For *op) override {
ScopedValue<string> old_innermost_block_var(innermost_block_var);
ScopedValue<string> old_innermost_thread_var(innermost_thread_var);
ScopedValue<int> old_gpu_block_depth(gpu_block_depth);
ScopedValue<int> old_gpu_thread_depth(gpu_thread_depth);

for (int i = 1; i <= 4; i++) {
if (ends_with(op->name, block_names[4 - i])) {
user_assert(i > gpu_block_depth)
<< "Invalid schedule: Loop over " << op->name
<< " cannot be inside of loop over " << innermost_block_var << "\n";
user_assert(gpu_thread_depth == 0)
<< "Invalid schedule: Loop over " << op->name
<< " cannot be inside of loop over " << innermost_thread_var << "\n";
innermost_block_var = op->name;
gpu_block_depth = i;
}
if (ends_with(op->name, thread_names[4 - i])) {
user_assert(i > gpu_thread_depth)
<< "Invalid schedule: Loop over " << op->name
<< " cannot be inside of loop over " << innermost_thread_var << "\n";
user_assert(gpu_block_depth > 0)
<< "Invalid schedule: Loop over " << op->name
<< " must be inside a loop over gpu blocks\n";
innermost_thread_var = op->name;
gpu_thread_depth = i;
}
}
IRVisitor::visit(op);
}
};

} // namespace

// Also used by InjectImageIntrinsics
Expand Down Expand Up @@ -1632,8 +1594,6 @@ class NormalizeIfStatements : public IRMutator {
} // namespace

Stmt fuse_gpu_thread_loops(Stmt s) {
ValidateGPULoopNesting validate;
s.accept(&validate);
// NormalizeIfStatements pushes the predicates between GPU blocks
// into the innermost GPU block. FuseGPUThreadLoops would then
// merge the predicate into the merged GPU thread.
Expand Down
39 changes: 0 additions & 39 deletions src/ScheduleFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2269,49 +2269,10 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_

std::ostringstream err;

// If you're compute_at() inside a gpu blocks loop, you can't have a gpu blocks loop yourself
const auto has_gpu_blocks = [&]() {
for (const Dim &d : f.definition().schedule().dims()) {
if (d.for_type == ForType::GPUBlock) {
return true;
}
}
return false;
};

const auto all_ok = [&]() {
return store_idx >= 0 && compute_idx >= 0 && hoist_storage_idx >= 0;
};

if (all_ok() && has_gpu_blocks()) {
for (int i = 0; i <= compute_idx; i++) {
if (sites[i].is_gpu_block) {
string site_fname = sites[i].loop_level.func();
user_error << "Functions that are compute_at() a gpu_block() loop cannot have their own gpu_block() loops, "
<< "but Func \"" << f.name() << "\" is compute_at() \"" << site_fname << "\"\n";
}
}
}

// If you're compute_at() a var marked as a gpu block var, it must be the innermost one
if (all_ok() && sites[compute_idx].is_gpu_block) {
string compute_at_fname = sites[compute_idx].loop_level.func();
int possibly_invalid_idx = compute_idx;
for (int i = compute_idx + 1; i < (int)sites.size(); i++) {
if (!sites[i].is_gpu_block) {
continue;
}
string site_fname = sites[i].loop_level.func();
if (site_fname == compute_at_fname) {
err << "Functions that are compute_at() a gpu_block() loop must specify the innermost gpu_block() loop for that Func.\n";
sites.erase(sites.begin() + possibly_invalid_idx);
// This one will also be invalid if we find a subsequent loop from the same func
possibly_invalid_idx = i;
store_idx = compute_idx = hoist_storage_idx = -1;
}
}
}

// Check there isn't a parallel loop between the compute_at and the store_at
if (all_ok()) {
for (int i = store_idx + 1; i <= compute_idx; i++) {
Expand Down
3 changes: 1 addition & 2 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,6 @@ tests(GROUPS correctness
gpu_data_flows.cpp
gpu_different_blocks_threads_dimensions.cpp
gpu_dynamic_shared.cpp
gpu_error_1.cpp
gpu_error_2.cpp
gpu_free_sync.cpp
gpu_give_input_buffers_device_allocations.cpp
gpu_jit_explicit_copy_to_device.cpp
Expand Down Expand Up @@ -187,6 +185,7 @@ tests(GROUPS correctness
interval.cpp
intrinsics.cpp
introspection.cpp
invalid_gpu_loop_nests.cpp
inverse.cpp
isnan.cpp
issue_3926.cpp
Expand Down
Loading

0 comments on commit 39e5c08

Please sign in to comment.