[async] Optimize StateFlowGraph::optimize_listgen (#1894)
* reproduce kernel profiler crashing under async mode

* fix CUDA kernel profiler

* speed up optimize listgen

* finalize

* simplify
yuanming-hu authored Sep 26, 2020
1 parent 6ac1a54 commit f3fb6d7
Showing 7 changed files with 62 additions and 40 deletions.
35 changes: 16 additions & 19 deletions misc/async_mgpcg.py
@@ -1,5 +1,6 @@
import numpy as np
import taichi as ti
import time

real = ti.f32
ti.init(default_fp=real,
@@ -9,13 +10,12 @@
async_opt_dse=True,
async_opt_activation_demotion=True,
async_opt_fusion=True,
kernel_profiler=False
kernel_profiler=True
#, async_opt_intermediate_file="mgpcg"
)

# grid parameters
N = 128
N_gui = 512 # gui resolution

n_mg_levels = 5
pre_and_post_smoothing = 2
@@ -35,7 +35,6 @@
alpha = ti.field(dtype=real) # step size
beta = ti.field(dtype=real) # step size
sum = ti.field(dtype=real) # storage for reductions
pixels = ti.field(dtype=real, shape=(N_gui, N_gui)) # image buffer
rTr = ti.field(dtype=real, shape=())
old_zTr = ti.field(dtype=real, shape=())
new_zTr = ti.field(dtype=real, shape=())
@@ -146,17 +145,6 @@ def apply_preconditioner():
smooth(l, 0)


@ti.kernel
def paint():
kk = N_tot * 3 // 8
for i, j in pixels:
ii = int(i * N / N_gui) + N_ext
jj = int(j * N / N_gui) + N_ext
pixels[i, j] = x[ii, jj, kk] / N_tot


gui = ti.GUI("mgpcg", res=(N_gui, N_gui))

init()

sum[None] = 0.0
@@ -193,8 +181,7 @@ def update_beta():
old_zTr[None] = new_zTr[None]


# CG
for i in range(10):
def iterate():
# alpha = rTr / pTAp
compute_Ap()
reduce(p, Ap, pAp)
@@ -224,9 +211,19 @@ def update_beta():
# p = z + beta p
update_p()

paint()
gui.set_image(pixels)
gui.show()

def loud_sync():
t = time.time()
ti.sync()
print(f'{time.time() - t:.3f} s (compilation + execution)')


ti.sync()
for _ in range(3):
for i in range(10):
iterate()
loud_sync()

ti.kernel_profiler_print()
ti.core.print_stat()
ti.print_profile_info()
9 changes: 7 additions & 2 deletions taichi/backends/cuda/cuda_context.cpp
@@ -59,9 +59,14 @@ void CUDAContext::launch(void *func,
unsigned grid_dim,
unsigned block_dim,
std::size_t shared_mem_bytes) {
// It is important to keep a handle: in async mode, a constant-folding
// kernel may be launched in the middle of another kernel launch, and
// profiler->start / profiler->stop would then be mismatched.

KernelProfilerBase::TaskHandle task_handle;
// Kernel launch
if (profiler)
profiler->start(task_name);
task_handle = profiler->start_with_handle(task_name);
auto context_guard = CUDAContext::get_instance().get_guard();

// TODO: remove usages of get_current_program here.
@@ -78,7 +83,7 @@
nullptr);
}
if (profiler)
profiler->stop();
profiler->stop(task_handle);

if (get_current_program().config.debug) {
driver.stream_synchronize(nullptr);
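The new comment above captures the profiler bug: the CUDA profiler used to remember a single shared stop event, so when async execution launches a constant-folding kernel while another kernel is already between `profiler->start` and `profiler->stop`, the outer launch stops the wrong event. The sketch below illustrates the per-launch-handle idea with simplified stand-in types (not the actual Taichi/CUDA event code): each `start_with_handle` call returns its own handle, and `stop(handle)` can only affect the launch that produced it.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Simplified stand-in for a per-launch profiling record.
struct Event {
  std::string kernel_name;
  bool stopped = false;
};

class HandleProfiler {
 public:
  using TaskHandle = Event *;

  TaskHandle start_with_handle(const std::string &kernel_name) {
    events_.push_back(new Event{kernel_name});
    return events_.back();  // the caller keeps this handle until stop()
  }

  void stop(TaskHandle handle) {
    handle->stopped = true;  // affects exactly the launch that produced it
  }

  ~HandleProfiler() {
    for (auto *e : events_)
      delete e;
  }

 private:
  std::vector<Event *> events_;
};

int main() {
  HandleProfiler profiler;
  auto outer = profiler.start_with_handle("user_kernel");
  // Async mode may launch a constant-folding kernel in the middle:
  auto nested = profiler.start_with_handle("constant_fold");
  profiler.stop(nested);
  profiler.stop(outer);  // still stops the right launch
  std::printf("outer stopped: %d, nested stopped: %d\n", (int)outer->stopped,
              (int)nested->stopped);
  return 0;
}
```

In the real `KernelProfilerCUDA` (see kernel_profiler.cpp below), the handle is the CUDA stop event created in `start_with_handle`, and `stop(handle)` records that event on the stream.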
5 changes: 4 additions & 1 deletion taichi/program/async_engine.cpp
@@ -75,7 +75,7 @@ IRHandle IRBank::fuse(IRHandle handle_a, IRHandle handle_b, Kernel *kernel) {
return result;
}

TI_INFO("Begin uncached fusion");
TI_TRACE("Begin uncached fusion");
// We are about to change both |task_a| and |task_b|. Clone them first.
auto cloned_task_a = handle_a.clone();
auto cloned_task_b = handle_b.clone();
@@ -382,6 +382,7 @@ void AsyncEngine::launch(Kernel *kernel, Context &context) {
}

TaskMeta *get_task_meta(IRBank *ir_bank, const TaskLaunchRecord &t) {
TI_AUTO_PROF
// TODO: this function should ideally take only an IRNode
static std::mutex mut;

@@ -485,6 +486,7 @@ TaskMeta *get_task_meta(IRBank *ir_bank, const TaskLaunchRecord &t) {
}

TaskFusionMeta get_task_fusion_meta(IRBank *bank, const TaskLaunchRecord &t) {
TI_AUTO_PROF
// TODO: this function should ideally take only an IRNode
auto &fusion_meta_bank = bank->fusion_meta_bank_;
if (fusion_meta_bank.find(t.ir_handle) != fusion_meta_bank.end()) {
@@ -536,6 +538,7 @@ void AsyncEngine::enqueue(const TaskLaunchRecord &t) {
}

void AsyncEngine::synchronize() {
TI_AUTO_PROF
bool modified = true;
TI_TRACE("Synchronizing SFG of {} nodes", sfg->size());
debug_sfg("initial");
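`get_task_meta`, `get_task_fusion_meta`, `AsyncEngine::synchronize`, `IRHandle::clone`, and most of the StateFlowGraph passes below each gain a `TI_AUTO_PROF` line, which is presumably what the new `ti.print_profile_info()` call in the benchmark reports. The sketch below shows the general RAII scoped-timer pattern such a macro can expand to, using hypothetical names (`AUTO_PROF`, `ScopedTimer`), not the actual Taichi implementation:

```cpp
#include <chrono>
#include <cstdio>
#include <string>

// Hypothetical RAII scoped timer: the constructor takes a timestamp and the
// destructor reports the elapsed time when the enclosing scope exits, so a
// single line instruments a whole function.
class ScopedTimer {
 public:
  explicit ScopedTimer(const std::string &name)
      : name_(name), begin_(std::chrono::steady_clock::now()) {
  }
  ~ScopedTimer() {
    auto end = std::chrono::steady_clock::now();
    double ms =
        std::chrono::duration<double, std::milli>(end - begin_).count();
    std::printf("%s took %.3f ms\n", name_.c_str(), ms);
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point begin_;
};

#define AUTO_PROF ScopedTimer _auto_prof_timer(__func__);

void some_sfg_pass() {
  AUTO_PROF  // one line times the whole pass, even on early returns
  // ... pass body ...
}

int main() {
  some_sfg_pass();
  return 0;
}
```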
1 change: 1 addition & 0 deletions taichi/program/async_utils.cpp
@@ -7,6 +7,7 @@
TLANG_NAMESPACE_BEGIN

std::unique_ptr<IRNode> IRHandle::clone() const {
TI_AUTO_PROF
// TODO: remove get_kernel() here
return irpass::analysis::clone(const_cast<IRNode *>(ir_), ir_->get_kernel());
}
13 changes: 6 additions & 7 deletions taichi/program/kernel_profiler.cpp
@@ -103,30 +103,29 @@ class DefaultProfiler : public KernelProfilerBase {
class KernelProfilerCUDA : public KernelProfilerBase {
public:
#if defined(TI_WITH_CUDA)
void *current_stop;

std::map<std::string, std::vector<std::pair<void *, void *>>>
outstanding_events;
#endif

void start(const std::string &kernel_name) override {
TaskHandle start_with_handle(const std::string &kernel_name) override {
#if defined(TI_WITH_CUDA)
void *start, *stop;
CUDADriver::get_instance().event_create(&start, CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_create(&stop, CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_record(start, 0);
outstanding_events[kernel_name].push_back(std::make_pair(start, stop));
current_stop = stop;
return stop;
#else
printf("CUDA Profiler not implemented;\n");
TI_NOT_IMPLEMENTED;
#endif
}

virtual void stop() override {
virtual void stop(TaskHandle handle) override {
#if defined(TI_WITH_CUDA)
CUDADriver::get_instance().event_record(current_stop, 0);
CUDADriver::get_instance().event_record(handle, 0);
#else
printf("CUDA Profiler not implemented;\n");
TI_NOT_IMPLEMENTED;
#endif
}

13 changes: 11 additions & 2 deletions taichi/program/kernel_profiler.h
@@ -33,6 +33,9 @@ class KernelProfilerBase {
double total_time;

public:
// Needed for the CUDA backend since we need to know which task to "stop"
using TaskHandle = void *;

void clear() {
total_time = 0;
records.clear();
@@ -42,12 +45,18 @@

virtual std::string title() const = 0;

virtual void start(const std::string &kernel_name) = 0;
// TODO: remove start and always use start_with_handle
virtual void start(const std::string &kernel_name){TI_NOT_IMPLEMENTED};

virtual TaskHandle start_with_handle(const std::string &kernel_name){
TI_NOT_IMPLEMENTED};

static void profiler_start(KernelProfilerBase *profiler,
const char *kernel_name);

virtual void stop() = 0;
virtual void stop(){TI_NOT_IMPLEMENTED};

virtual void stop(TaskHandle){TI_NOT_IMPLEMENTED};

static void profiler_stop(KernelProfilerBase *profiler);

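With this change neither interface is pure virtual: the legacy `start()`/`stop()` pair and the handle-based pair both get `TI_NOT_IMPLEMENTED` default bodies, so a backend such as `KernelProfilerCUDA` can implement only the handle-based variant while other backends keep the legacy one. A simplified sketch of that arrangement, with hypothetical class names rather than the real Taichi ones:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

// Base class declares both interfaces with "unsupported" default bodies, so
// each backend overrides only the variant it actually supports.
class ProfilerBase {
 public:
  using TaskHandle = void *;
  virtual ~ProfilerBase() = default;

  // Legacy interface.
  virtual void start(const std::string &) { throw std::logic_error("unsupported"); }
  virtual void stop() { throw std::logic_error("unsupported"); }

  // Handle-based interface.
  virtual TaskHandle start_with_handle(const std::string &) {
    throw std::logic_error("unsupported");
  }
  virtual void stop(TaskHandle) { throw std::logic_error("unsupported"); }
};

// A backend that only ever times the most recent task keeps the legacy pair.
class SimpleProfiler : public ProfilerBase {
 public:
  void start(const std::string &name) override { last_ = name; }
  void stop() override { std::printf("finished %s\n", last_.c_str()); }

 private:
  std::string last_;
};

// An event-based backend (like the CUDA profiler) implements the handle pair.
class EventProfiler : public ProfilerBase {
 public:
  TaskHandle start_with_handle(const std::string &name) override {
    return new std::string(name);  // stands in for a per-launch stop event
  }
  void stop(TaskHandle handle) override {
    auto *name = static_cast<std::string *>(handle);
    std::printf("finished %s\n", name->c_str());
    delete name;
  }
};

int main() {
  EventProfiler profiler;
  auto handle = profiler.start_with_handle("kernel_launch");
  profiler.stop(handle);
  return 0;
}
```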
26 changes: 17 additions & 9 deletions taichi/program/state_flow_graph.cpp
@@ -96,6 +96,7 @@ void StateFlowGraph::insert_state_flow(Node *from, Node *to, AsyncState state) {
}

bool StateFlowGraph::optimize_listgen() {
TI_AUTO_PROF
bool modified = false;

std::vector<std::pair<int, int>> common_pairs;
@@ -116,16 +117,18 @@
for (auto &record : listgen_nodes) {
auto &listgens = record.second;

// Thanks to the dependency edges, the order of nodes in listgens seems to
// be UNIQUE
// TODO: prove
// Thanks to the dependency edges, the order of nodes in listgens is
// UNIQUE. (Consider the list state of the SNode.)

// We can only replace a continuous subset of listgens entries
// We can only replace a continuous subset of listgen entries.
// So the nested loop below is actually O(n).
for (int i = 0; i < listgens.size(); i++) {
auto node_a = listgens[i];

bool erased_any = false;

auto new_i = i;

for (int j = i + 1; j < listgens.size(); j++) {
auto node_b = listgens[j];

@@ -186,15 +189,12 @@

nodes_to_delete.insert(node_b->node_id);
erased_any = true;
new_i = j;
}

if (erased_any)
break;
i = new_i;
}
}

TI_ASSERT(nodes_to_delete.size() % 2 == 0);

if (!nodes_to_delete.empty()) {
modified = true;
delete_nodes(nodes_to_delete);
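The loop restructuring is what the updated comments describe: only a contiguous run of list-generation tasks immediately after `listgens[i]` can be folded into it, so instead of breaking out of the outer loop after the first deletion, the inner loop records the last merged position in `new_i` and the outer index jumps there, making the whole scan linear. A standalone sketch of that pattern, with toy task types and a stand-in `covers()` predicate instead of the real node/state checks:

```cpp
#include <cstdio>
#include <functional>
#include <vector>

struct ListgenTask {
  int node_id;
};

// Returns the ids of listgen tasks that can be deleted. Only a contiguous run
// of tasks right after task i can be replaced by it, so once the run ends the
// outer index jumps past everything already handled; every task is visited a
// constant number of times and the nested loop is O(n) overall.
std::vector<int> redundant_listgens(
    const std::vector<ListgenTask> &listgens,
    const std::function<bool(const ListgenTask &, const ListgenTask &)>
        &covers) {
  std::vector<int> to_delete;
  for (int i = 0; i < (int)listgens.size(); i++) {
    int new_i = i;
    for (int j = i + 1; j < (int)listgens.size(); j++) {
      if (!covers(listgens[i], listgens[j]))
        break;  // the replaceable run must be contiguous
      to_delete.push_back(listgens[j].node_id);
      new_i = j;  // remember the last task merged into task i
    }
    i = new_i;  // skip the merged run (the loop header then does i++)
  }
  return to_delete;
}

int main() {
  // Pretend tasks 0..2 generate identical lists and task 3 does not.
  std::vector<ListgenTask> tasks = {{0}, {1}, {2}, {3}};
  auto dead = redundant_listgens(
      tasks, [](const ListgenTask &a, const ListgenTask &b) {
        return a.node_id == 0 && b.node_id <= 2;
      });
  for (int id : dead)
    std::printf("delete listgen node %d\n", id);  // prints 1 and 2
  return 0;
}
```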
@@ -445,6 +445,7 @@ bool StateFlowGraph::fuse() {
}

std::vector<TaskLaunchRecord> StateFlowGraph::extract(bool sort) {
TI_AUTO_PROF
if (sort)
topo_sort_nodes();
std::vector<TaskLaunchRecord> tasks;
@@ -673,6 +674,7 @@ std::string StateFlowGraph::dump_dot(const std::optional<std::string> &rankdir,
}

void StateFlowGraph::topo_sort_nodes() {
TI_AUTO_PROF
std::deque<std::unique_ptr<Node>> queue;
std::vector<std::unique_ptr<Node>> new_nodes;
std::vector<int> degrees_in(nodes_.size());
@@ -718,6 +720,7 @@ void StateFlowGraph::topo_sort_nodes() {
}

void StateFlowGraph::reid_nodes() {
TI_AUTO_PROF
for (int i = 0; i < nodes_.size(); i++) {
nodes_[i]->node_id = i;
}
@@ -727,6 +730,7 @@
void StateFlowGraph::replace_reference(StateFlowGraph::Node *node_a,
StateFlowGraph::Node *node_b,
bool only_output_edges) {
TI_AUTO_PROF
// replace all edges to node A with new ones to node B
for (auto &edges : node_a->output_edges) {
// Find all nodes C that points to A
@@ -763,6 +767,7 @@ void StateFlowGraph::replace_reference(StateFlowGraph::Node *node_a,

void StateFlowGraph::delete_nodes(
const std::unordered_set<int> &indices_to_delete) {
TI_AUTO_PROF
std::vector<std::unique_ptr<Node>> new_nodes_;
std::unordered_set<Node *> nodes_to_delete;

@@ -796,6 +801,7 @@
}

bool StateFlowGraph::optimize_dead_store() {
TI_AUTO_PROF
bool modified = false;

for (int i = 1; i < nodes_.size(); i++) {
@@ -881,6 +887,7 @@ bool StateFlowGraph::optimize_dead_store() {
}

void StateFlowGraph::verify() {
TI_AUTO_PROF
const int n = nodes_.size();
reid_nodes();
for (int i = 0; i < n; i++) {
@@ -936,6 +943,7 @@ void StateFlowGraph::verify() {
}

bool StateFlowGraph::demote_activation() {
TI_AUTO_PROF
bool modified = false;

topo_sort_nodes();
