Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better validation of gpu schedules #8068

Merged
merged 6 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 203 additions & 9 deletions src/CanonicalizeGPUVars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ using std::string;
using std::vector;

namespace {
string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"};
string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"};
string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"};
string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"};

string get_thread_name(int index) {
internal_assert(index >= 0 && index < 4);
internal_assert(index >= 0 && index < 3);
return thread_names[index];
}

string get_block_name(int index) {
internal_assert(index >= 0 && index < 4);
internal_assert(index >= 0 && index < 3);
return block_names[index];
}

Expand Down Expand Up @@ -111,10 +111,6 @@ class CanonicalizeGPUVars : public IRMutator {

CountGPUBlocksThreads counter;
op->body.accept(&counter);
internal_assert(counter.nblocks <= 4)
<< op->name << " can only have maximum of 4 block dimensions\n";
internal_assert(counter.nthreads <= 4)
<< op->name << " can only have maximum of 4 thread dimensions\n";

if (op->for_type == ForType::GPUBlock) {
name += "." + get_block_name(counter.nblocks);
Expand All @@ -123,7 +119,6 @@ class CanonicalizeGPUVars : public IRMutator {
name += "." + get_thread_name(counter.nthreads);
debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n";
} else if (op->for_type == ForType::GPULane) {
user_assert(counter.nlanes == 0) << "Cannot nest multiple loops over gpu lanes: " << name << "\n";
name += "." + get_thread_name(0);
}

Expand Down Expand Up @@ -190,9 +185,208 @@ class CanonicalizeGPUVars : public IRMutator {
}
};

std::string loop_nest_summary_to_node(const IRNode *root, const IRNode *target) {
class Summary : public IRVisitor {
public:
std::vector<std::ostringstream> stack;
Summary(const IRNode *target)
: target(target) {
}

protected:
const IRNode *target;
bool done = false;

using IRVisitor::visit;

void visit(const For *op) override {
if (done) {
return;
}
stack.emplace_back();
stack.back() << op->for_type << " " << op->name;
if (op == target) {
done = true;
} else {
IRVisitor::visit(op);
if (!done) {
stack.pop_back();
}
}
}

void visit(const Realize *op) override {
if (done) {
return;
}
stack.emplace_back();
stack.back() << "store_at for " << op->name;
IRVisitor::visit(op);
if (!done) {
stack.pop_back();
}
}

void visit(const HoistedStorage *op) override {
if (done) {
return;
}
stack.emplace_back();
stack.back() << "hoisted storage for " << op->name;
IRVisitor::visit(op);
if (!done) {
stack.pop_back();
}
}

void visit(const ProducerConsumer *op) override {
if (done) {
return;
}
if (op->is_producer) {
stack.emplace_back();
stack.back() << "compute_at for " << op->name;
IRVisitor::visit(op);
if (!done) {
stack.pop_back();
}
} else {
IRVisitor::visit(op);
}
}
} summary{target};

root->accept(&summary);

std::ostringstream result;
std::string prefix = "";
result << "The loop nest is:\n";
for (const auto &str : summary.stack) {
result << prefix << str.str() << ":\n";
prefix += " ";
}
return result.str();
};

// Check the user's GPU schedule is valid. Throws an error if it is not, so no
// return value required.
class ValidateGPUSchedule : public IRVisitor {

using IRVisitor::visit;

const IRNode *root = nullptr;

int in_blocks = 0;
int in_threads = 0;
int in_lanes = 0;

std::string innermost_blocks_loop, innermost_threads_loop;
std::ostringstream blocks_not_ok_reason;

void visit(const For *op) override {
if (!root) {
root = op;
}
if (in_blocks && op->for_type != ForType::GPUBlock && blocks_not_ok_reason.tellp() == 0) {
blocks_not_ok_reason << op->for_type << " loop over " << op->name;
}
if (op->for_type == ForType::GPUBlock) {
user_assert(blocks_not_ok_reason.tellp() == 0)
<< blocks_not_ok_reason.str() << " is inside GPU block loop over "
<< innermost_blocks_loop << " but outside GPU block loop over " << op->name
<< ". Funcs cannot be scheduled in between GPU block loops. "
<< loop_nest_summary_to_node(root, op);
user_assert(in_blocks < 3)
<< "GPU block loop over " << op->name << " is inside three other GPU block loops. "
<< "The maximum number of nested GPU block loops is 3. "
<< loop_nest_summary_to_node(root, op);
user_assert(in_threads == 0)
<< "GPU block loop over " << op->name << " is inside GPU thread loop over "
<< innermost_threads_loop << ". "
<< loop_nest_summary_to_node(root, op);
in_blocks++;
ScopedValue<std::string> s(innermost_blocks_loop, op->name);
IRVisitor::visit(op);
in_blocks--;
} else if (op->for_type == ForType::GPUThread) {
user_assert(in_lanes == 0)
<< "GPU thread loop over " << op->name << " is inside a loop over GPU lanes. "
<< "GPU thread loops must be outside any GPU lane loop. "
<< loop_nest_summary_to_node(root, op);
user_assert(in_threads < 3)
<< "GPU thread loop over " << op->name << " is inside three other GPU thread loops. "
<< "The maximum number of nested GPU thread loops is 3. "
<< loop_nest_summary_to_node(root, op);
user_assert(in_blocks)
<< "GPU thread loop over " << op->name << " must be inside a GPU block loop. "
<< loop_nest_summary_to_node(root, op);
in_threads++;
ScopedValue<std::string> s(innermost_threads_loop, op->name);
IRVisitor::visit(op);
in_threads--;
} else if (op->for_type == ForType::GPULane) {
user_assert(in_threads < 3)
<< "GPU lane loop over " << op->name << " is inside three other GPU thread or lane loops. "
<< "The maximum number of nested GPU thread or lane loops is 3. "
<< loop_nest_summary_to_node(root, op);
user_assert(in_lanes == 0)
<< "GPU lane loop over " << op->name << " is inside another GPU lane loop. GPU lane loops "
<< "may not be nested. "
<< loop_nest_summary_to_node(root, op);
in_lanes++;
ScopedValue<std::string> s(innermost_threads_loop, op->name);
IRVisitor::visit(op);
in_lanes--;
} else {
IRVisitor::visit(op);
}
}

void visit(const Realize *op) override {
if (!root) {
root = op;
}
if (in_blocks && blocks_not_ok_reason.tellp() == 0) {
blocks_not_ok_reason << "store_at location for " << op->name;
IRVisitor::visit(op);
blocks_not_ok_reason.clear();
} else {
IRVisitor::visit(op);
}
}

void visit(const ProducerConsumer *op) override {
if (!root) {
root = op;
}
if (op->is_producer && in_blocks && blocks_not_ok_reason.tellp() == 0) {
blocks_not_ok_reason << "compute_at location for " << op->name;
IRVisitor::visit(op);
blocks_not_ok_reason.clear();
} else {
IRVisitor::visit(op);
}
}

void visit(const HoistedStorage *op) override {
if (!root) {
root = op;
}
if (in_blocks && blocks_not_ok_reason.tellp() == 0) {
blocks_not_ok_reason << "hoist_storage location for " << op->name;
IRVisitor::visit(op);
blocks_not_ok_reason.clear();
} else {
IRVisitor::visit(op);
}
}
};

} // anonymous namespace

Stmt canonicalize_gpu_vars(Stmt s) {
ValidateGPUSchedule validator;
s.accept(&validator);
CanonicalizeGPUVars canonicalizer;
s = canonicalizer.mutate(s);
return s;
Expand Down
40 changes: 0 additions & 40 deletions src/FuseGPUThreadLoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1515,44 +1515,6 @@ class ZeroGPULoopMins : public IRMutator {
ZeroGPULoopMins() = default;
};

class ValidateGPULoopNesting : public IRVisitor {
int gpu_block_depth = 0, gpu_thread_depth = 0;
string innermost_block_var, innermost_thread_var;

using IRVisitor::visit;

void visit(const For *op) override {
ScopedValue<string> old_innermost_block_var(innermost_block_var);
ScopedValue<string> old_innermost_thread_var(innermost_thread_var);
ScopedValue<int> old_gpu_block_depth(gpu_block_depth);
ScopedValue<int> old_gpu_thread_depth(gpu_thread_depth);

for (int i = 1; i <= 4; i++) {
if (ends_with(op->name, block_names[4 - i])) {
user_assert(i > gpu_block_depth)
<< "Invalid schedule: Loop over " << op->name
<< " cannot be inside of loop over " << innermost_block_var << "\n";
user_assert(gpu_thread_depth == 0)
<< "Invalid schedule: Loop over " << op->name
<< " cannot be inside of loop over " << innermost_thread_var << "\n";
innermost_block_var = op->name;
gpu_block_depth = i;
}
if (ends_with(op->name, thread_names[4 - i])) {
user_assert(i > gpu_thread_depth)
<< "Invalid schedule: Loop over " << op->name
<< " cannot be inside of loop over " << innermost_thread_var << "\n";
user_assert(gpu_block_depth > 0)
<< "Invalid schedule: Loop over " << op->name
<< " must be inside a loop over gpu blocks\n";
innermost_thread_var = op->name;
gpu_thread_depth = i;
}
}
IRVisitor::visit(op);
}
};

} // namespace

// Also used by InjectImageIntrinsics
Expand Down Expand Up @@ -1632,8 +1594,6 @@ class NormalizeIfStatements : public IRMutator {
} // namespace

Stmt fuse_gpu_thread_loops(Stmt s) {
ValidateGPULoopNesting validate;
s.accept(&validate);
// NormalizeIfStatements pushes the predicates between GPU blocks
// into the innermost GPU block. FuseGPUThreadLoops would then
// merge the predicate into the merged GPU thread.
Expand Down
31 changes: 2 additions & 29 deletions src/ScheduleFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2269,6 +2269,7 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_

std::ostringstream err;

/*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is meant to left here in commented-out form, it is essential you add a comment explaining why. (Otherwise, just delete it.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, deleted.

// If you're compute_at() inside a gpu blocks loop, you can't have a gpu blocks loop yourself
const auto has_gpu_blocks = [&]() {
for (const Dim &d : f.definition().schedule().dims()) {
Expand All @@ -2278,40 +2279,12 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_
}
return false;
};
*/

const auto all_ok = [&]() {
return store_idx >= 0 && compute_idx >= 0 && hoist_storage_idx >= 0;
};

if (all_ok() && has_gpu_blocks()) {
for (int i = 0; i <= compute_idx; i++) {
if (sites[i].is_gpu_block) {
string site_fname = sites[i].loop_level.func();
user_error << "Functions that are compute_at() a gpu_block() loop cannot have their own gpu_block() loops, "
<< "but Func \"" << f.name() << "\" is compute_at() \"" << site_fname << "\"\n";
}
}
}

// If you're compute_at() a var marked as a gpu block var, it must be the innermost one
if (all_ok() && sites[compute_idx].is_gpu_block) {
string compute_at_fname = sites[compute_idx].loop_level.func();
int possibly_invalid_idx = compute_idx;
for (int i = compute_idx + 1; i < (int)sites.size(); i++) {
if (!sites[i].is_gpu_block) {
continue;
}
string site_fname = sites[i].loop_level.func();
if (site_fname == compute_at_fname) {
err << "Functions that are compute_at() a gpu_block() loop must specify the innermost gpu_block() loop for that Func.\n";
sites.erase(sites.begin() + possibly_invalid_idx);
// This one will also be invalid if we find a subsequent loop from the same func
possibly_invalid_idx = i;
store_idx = compute_idx = hoist_storage_idx = -1;
}
}
}

// Check there isn't a parallel loop between the compute_at and the store_at
if (all_ok()) {
for (int i = store_idx + 1; i <= compute_idx; i++) {
Expand Down
3 changes: 1 addition & 2 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,6 @@ tests(GROUPS correctness
gpu_data_flows.cpp
gpu_different_blocks_threads_dimensions.cpp
gpu_dynamic_shared.cpp
gpu_error_1.cpp
gpu_error_2.cpp
gpu_free_sync.cpp
gpu_give_input_buffers_device_allocations.cpp
gpu_jit_explicit_copy_to_device.cpp
Expand Down Expand Up @@ -186,6 +184,7 @@ tests(GROUPS correctness
interval.cpp
intrinsics.cpp
introspection.cpp
invalid_gpu_loop_nests.cpp
inverse.cpp
isnan.cpp
issue_3926.cpp
Expand Down
Loading
Loading