From f3fb6d76393633fb463076c1b843eba1b21ad20e Mon Sep 17 00:00:00 2001
From: Yuanming Hu
Date: Sat, 26 Sep 2020 01:34:16 -0400
Subject: [PATCH] [async] Optimize StateFlowGraph::optimize_listgen (#1894)

* reproduce kernel profiler crashing under async mode
* fix CUDA kernel profiler
* speed up optimize listgen
* finalize
* simplify
---
 misc/async_mgpcg.py                   | 35 ++++++++++++---------------
 taichi/backends/cuda/cuda_context.cpp |  9 +++++--
 taichi/program/async_engine.cpp       |  5 +++-
 taichi/program/async_utils.cpp        |  1 +
 taichi/program/kernel_profiler.cpp    | 13 +++++-----
 taichi/program/kernel_profiler.h      | 13 ++++++++--
 taichi/program/state_flow_graph.cpp   | 26 +++++++++++++-------
 7 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/misc/async_mgpcg.py b/misc/async_mgpcg.py
index 2efdd0bffe969..ae7aacf8e61df 100644
--- a/misc/async_mgpcg.py
+++ b/misc/async_mgpcg.py
@@ -1,5 +1,6 @@
 import numpy as np
 import taichi as ti
+import time
 
 real = ti.f32
 ti.init(default_fp=real,
@@ -9,13 +10,12 @@
         async_opt_dse=True,
         async_opt_activation_demotion=True,
         async_opt_fusion=True,
-        kernel_profiler=False
+        kernel_profiler=True
         #, async_opt_intermediate_file="mgpcg"
         )
 
 # grid parameters
 N = 128
-N_gui = 512  # gui resolution
 
 n_mg_levels = 5
 pre_and_post_smoothing = 2
@@ -35,7 +35,6 @@
 alpha = ti.field(dtype=real)  # step size
 beta = ti.field(dtype=real)  # step size
 sum = ti.field(dtype=real)  # storage for reductions
-pixels = ti.field(dtype=real, shape=(N_gui, N_gui))  # image buffer
 rTr = ti.field(dtype=real, shape=())
 old_zTr = ti.field(dtype=real, shape=())
 new_zTr = ti.field(dtype=real, shape=())
@@ -146,17 +145,6 @@ def apply_preconditioner():
         smooth(l, 0)
 
 
-@ti.kernel
-def paint():
-    kk = N_tot * 3 // 8
-    for i, j in pixels:
-        ii = int(i * N / N_gui) + N_ext
-        jj = int(j * N / N_gui) + N_ext
-        pixels[i, j] = x[ii, jj, kk] / N_tot
-
-
-gui = ti.GUI("mgpcg", res=(N_gui, N_gui))
-
 init()
 
 sum[None] = 0.0
@@ -193,8 +181,7 @@ def update_beta():
     old_zTr[None] = new_zTr[None]
 
 
-# CG
-for i in range(10):
+def iterate():
     # alpha = rTr / pTAp
     compute_Ap()
     reduce(p, Ap, pAp)
@@ -224,9 +211,19 @@ def update_beta():
     # p = z + beta p
     update_p()
 
-paint()
-gui.set_image(pixels)
-gui.show()
+
+def loud_sync():
+    t = time.time()
+    ti.sync()
+    print(f'{time.time() - t:.3f} s (compilation + execution)')
+
+
+ti.sync()
+for _ in range(3):
+    for i in range(10):
+        iterate()
+    loud_sync()
 
 ti.kernel_profiler_print()
 ti.core.print_stat()
+ti.print_profile_info()
diff --git a/taichi/backends/cuda/cuda_context.cpp b/taichi/backends/cuda/cuda_context.cpp
index d86ba92de9246..0438414699f3c 100644
--- a/taichi/backends/cuda/cuda_context.cpp
+++ b/taichi/backends/cuda/cuda_context.cpp
@@ -59,9 +59,14 @@ void CUDAContext::launch(void *func,
                          unsigned grid_dim,
                          unsigned block_dim,
                          std::size_t shared_mem_bytes) {
+  // It is important to keep a handle: in async mode, a constant folding
+  // kernel may be launched in the middle of another kernel launch, which
+  // would make profiler->start and profiler->stop mismatch.
+
+  KernelProfilerBase::TaskHandle task_handle;
   // Kernel launch
   if (profiler)
-    profiler->start(task_name);
+    task_handle = profiler->start_with_handle(task_name);
   auto context_guard = CUDAContext::get_instance().get_guard();
 
   // TODO: remove usages of get_current_program here.
@@ -78,7 +83,7 @@ void CUDAContext::launch(void *func,
                          nullptr);
   }
   if (profiler)
-    profiler->stop();
+    profiler->stop(task_handle);
 
   if (get_current_program().config.debug) {
     driver.stream_synchronize(nullptr);
diff --git a/taichi/program/async_engine.cpp b/taichi/program/async_engine.cpp
index c111de9f02e87..38f2003dafce4 100644
--- a/taichi/program/async_engine.cpp
+++ b/taichi/program/async_engine.cpp
@@ -75,7 +75,7 @@ IRHandle IRBank::fuse(IRHandle handle_a, IRHandle handle_b, Kernel *kernel) {
     return result;
   }
 
-  TI_INFO("Begin uncached fusion");
+  TI_TRACE("Begin uncached fusion");
   // We are about to change both |task_a| and |task_b|. Clone them first.
   auto cloned_task_a = handle_a.clone();
   auto cloned_task_b = handle_b.clone();
@@ -382,6 +382,7 @@ void AsyncEngine::launch(Kernel *kernel, Context &context) {
 }
 
 TaskMeta *get_task_meta(IRBank *ir_bank, const TaskLaunchRecord &t) {
+  TI_AUTO_PROF
   // TODO: this function should ideally take only an IRNode
   static std::mutex mut;
 
@@ -485,6 +486,7 @@ TaskMeta *get_task_meta(IRBank *ir_bank, const TaskLaunchRecord &t) {
 }
 
 TaskFusionMeta get_task_fusion_meta(IRBank *bank, const TaskLaunchRecord &t) {
+  TI_AUTO_PROF
   // TODO: this function should ideally take only an IRNode
   auto &fusion_meta_bank = bank->fusion_meta_bank_;
   if (fusion_meta_bank.find(t.ir_handle) != fusion_meta_bank.end()) {
@@ -536,6 +538,7 @@ void AsyncEngine::enqueue(const TaskLaunchRecord &t) {
 }
 
 void AsyncEngine::synchronize() {
+  TI_AUTO_PROF
   bool modified = true;
   TI_TRACE("Synchronizing SFG of {} nodes", sfg->size());
   debug_sfg("initial");
diff --git a/taichi/program/async_utils.cpp b/taichi/program/async_utils.cpp
index 3112246c2925f..41677d8d95ee2 100644
--- a/taichi/program/async_utils.cpp
+++ b/taichi/program/async_utils.cpp
@@ -7,6 +7,7 @@ TLANG_NAMESPACE_BEGIN
 
 std::unique_ptr<IRNode> IRHandle::clone() const {
+  TI_AUTO_PROF
   // TODO: remove get_kernel() here
   return irpass::analysis::clone(const_cast<IRNode *>(ir_), ir_->get_kernel());
 }
 
diff --git a/taichi/program/kernel_profiler.cpp b/taichi/program/kernel_profiler.cpp
index 72eb2090abb09..8601457958e56 100644
--- a/taichi/program/kernel_profiler.cpp
+++ b/taichi/program/kernel_profiler.cpp
@@ -103,30 +103,29 @@ class DefaultProfiler : public KernelProfilerBase {
 class KernelProfilerCUDA : public KernelProfilerBase {
  public:
 #if defined(TI_WITH_CUDA)
-  void *current_stop;
 
   std::map<std::string, std::vector<std::pair<void *, void *>>>
       outstanding_events;
 #endif
 
-  void start(const std::string &kernel_name) override {
+  TaskHandle start_with_handle(const std::string &kernel_name) override {
 #if defined(TI_WITH_CUDA)
     void *start, *stop;
     CUDADriver::get_instance().event_create(&start, CU_EVENT_DEFAULT);
     CUDADriver::get_instance().event_create(&stop, CU_EVENT_DEFAULT);
     CUDADriver::get_instance().event_record(start, 0);
     outstanding_events[kernel_name].push_back(std::make_pair(start, stop));
-    current_stop = stop;
+    return stop;
 #else
-    printf("CUDA Profiler not implemented;\n");
+    TI_NOT_IMPLEMENTED;
 #endif
   }
 
-  virtual void stop() override {
+  virtual void stop(TaskHandle handle) override {
 #if defined(TI_WITH_CUDA)
-    CUDADriver::get_instance().event_record(current_stop, 0);
+    CUDADriver::get_instance().event_record(handle, 0);
 #else
-    printf("CUDA Profiler not implemented;\n");
+    TI_NOT_IMPLEMENTED;
 #endif
   }
 
diff --git a/taichi/program/kernel_profiler.h b/taichi/program/kernel_profiler.h
index b478fecdc1828..26c9fcc924340 100644
--- a/taichi/program/kernel_profiler.h
+++ b/taichi/program/kernel_profiler.h
@@ -33,6 +33,9 @@ class KernelProfilerBase {
   double total_time;
 
  public:
+  // Needed for the CUDA backend since we need to know which task to "stop"
+  using TaskHandle = void *;
+
   void clear() {
     total_time = 0;
     records.clear();
@@ -42,12 +45,18 @@
 
   virtual std::string title() const = 0;
 
-  virtual void start(const std::string &kernel_name) = 0;
+  // TODO: remove start and always use start_with_handle
+  virtual void start(const std::string &kernel_name){TI_NOT_IMPLEMENTED};
+
+  virtual TaskHandle start_with_handle(const std::string &kernel_name){
+      TI_NOT_IMPLEMENTED};
 
   static void profiler_start(KernelProfilerBase *profiler,
                              const char *kernel_name);
 
-  virtual void stop() = 0;
+  virtual void stop(){TI_NOT_IMPLEMENTED};
+
+  virtual void stop(TaskHandle){TI_NOT_IMPLEMENTED};
 
   static void profiler_stop(KernelProfilerBase *profiler);
 
diff --git a/taichi/program/state_flow_graph.cpp b/taichi/program/state_flow_graph.cpp
index 43875c7ee8fd2..a01c7aeaaf3fc 100644
--- a/taichi/program/state_flow_graph.cpp
+++ b/taichi/program/state_flow_graph.cpp
@@ -96,6 +96,7 @@ void StateFlowGraph::insert_state_flow(Node *from, Node *to, AsyncState state) {
 }
 
 bool StateFlowGraph::optimize_listgen() {
+  TI_AUTO_PROF
   bool modified = false;
 
   std::vector<std::pair<Node *, Node *>> common_pairs;
@@ -116,16 +117,18 @@
   for (auto &record : listgen_nodes) {
     auto &listgens = record.second;
 
-    // Thanks to the dependency edges, the order of nodes in listgens seems to
-    // be UNIQUE
-    // TODO: prove
+    // Thanks to the dependency edges, the order of nodes in listgens is
+    // UNIQUE. (Consider the list state of the SNode.)
 
-    // We can only replace a continuous subset of listgens entries
+    // We can only replace a contiguous subset of listgen entries.
+    // So the nested loop below is actually O(n).
     for (int i = 0; i < listgens.size(); i++) {
       auto node_a = listgens[i];
 
       bool erased_any = false;
 
+      auto new_i = i;
+
       for (int j = i + 1; j < listgens.size(); j++) {
         auto node_b = listgens[j];
 
@@ -186,15 +189,12 @@
 
         nodes_to_delete.insert(node_b->node_id);
         erased_any = true;
+        new_i = j;
       }
-
-      if (erased_any)
-        break;
+      i = new_i;
     }
   }
 
-  TI_ASSERT(nodes_to_delete.size() % 2 == 0);
-
   if (!nodes_to_delete.empty()) {
     modified = true;
     delete_nodes(nodes_to_delete);
@@ -445,6 +445,7 @@ bool StateFlowGraph::fuse() {
 }
 
 std::vector<TaskLaunchRecord> StateFlowGraph::extract(bool sort) {
+  TI_AUTO_PROF
   if (sort)
     topo_sort_nodes();
   std::vector<TaskLaunchRecord> tasks;
@@ -673,6 +674,7 @@ std::string StateFlowGraph::dump_dot(const std::optional<std::string> &rankdir,
 }
 
 void StateFlowGraph::topo_sort_nodes() {
+  TI_AUTO_PROF
   std::deque<std::unique_ptr<Node>> queue;
   std::vector<std::unique_ptr<Node>> new_nodes;
   std::vector<int> degrees_in(nodes_.size());
@@ -718,6 +720,7 @@ void StateFlowGraph::topo_sort_nodes() {
 }
 
 void StateFlowGraph::reid_nodes() {
+  TI_AUTO_PROF
   for (int i = 0; i < nodes_.size(); i++) {
     nodes_[i]->node_id = i;
   }
@@ -727,6 +730,7 @@ void StateFlowGraph::reid_nodes() {
 void StateFlowGraph::replace_reference(StateFlowGraph::Node *node_a,
                                        StateFlowGraph::Node *node_b,
                                        bool only_output_edges) {
+  TI_AUTO_PROF
   // replace all edges to node A with new ones to node B
   for (auto &edges : node_a->output_edges) {
     // Find all nodes C that points to A
@@ -763,6 +767,7 @@ void StateFlowGraph::replace_reference(StateFlowGraph::Node *node_a,
 
 void StateFlowGraph::delete_nodes(
     const std::unordered_set<int> &indices_to_delete) {
+  TI_AUTO_PROF
   std::vector<std::unique_ptr<Node>> new_nodes_;
   std::unordered_set<Node *> nodes_to_delete;
 
@@ -796,6 +801,7 @@ void StateFlowGraph::delete_nodes(
 }
 
 bool StateFlowGraph::optimize_dead_store() {
+  TI_AUTO_PROF
   bool modified = false;
 
   for (int i = 1; i < nodes_.size(); i++) {
@@ -881,6 +887,7 @@ bool StateFlowGraph::optimize_dead_store() {
 }
 
 void StateFlowGraph::verify() {
+  TI_AUTO_PROF
   const int n = nodes_.size();
   reid_nodes();
   for (int i = 0; i < n; i++) {
@@ -936,6 +943,7 @@ void StateFlowGraph::verify() {
 }
 
 bool StateFlowGraph::demote_activation() {
+  TI_AUTO_PROF
   bool modified = false;
 
   topo_sort_nodes();
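
A note on the new scan in optimize_listgen: redundant list generations form a contiguous run right after the node they can be merged into, so remembering where the run ends (new_i) and jumping the outer index there (i = new_i) lets the nested loops touch each node only a constant number of times. The old erased_any/break logic instead bailed out of the outer loop after the first merged run and left any remaining runs to later invocations of the pass. The standalone C++ sketch below is not taken from the Taichi sources; can_merge, the sample values, and the main() driver are made-up stand-ins for the real dependency-edge test on StateFlowGraph nodes, kept only so the bookkeeping can be run in isolation.

// Simplified model of the new optimize_listgen scan (illustrative only, not
// the actual StateFlowGraph code): each maximal contiguous run of mergeable
// entries is collapsed into its first element, and the outer index jumps past
// the run via new_i, so every element is inspected at most twice overall.
#include <cstdio>
#include <unordered_set>
#include <vector>

// Stand-in for the real "node_b's list generation is redundant w.r.t. node_a"
// test, which in the patch is based on dependency edges between SFG nodes.
bool can_merge(int a, int b) {
  return (a % 3) == (b % 3);  // toy predicate for demonstration only
}

int main() {
  std::vector<int> listgens = {3, 6, 9, 4, 7, 2, 5, 8};
  std::unordered_set<int> to_delete;  // indices of redundant entries

  for (int i = 0; i < (int)listgens.size(); i++) {
    int new_i = i;
    for (int j = i + 1; j < (int)listgens.size(); j++) {
      if (!can_merge(listgens[i], listgens[j]))
        break;              // runs are contiguous, so stop at the first miss
      to_delete.insert(j);  // listgens[j] is redundant; schedule for deletion
      new_i = j;            // remember how far this run extends
    }
    i = new_i;              // skip the whole run instead of rescanning it
  }

  for (int idx : to_delete)
    printf("would delete entry %d (value %d)\n", idx, listgens[idx]);
  return 0;
}

Deferring the actual removal through a set of indices mirrors the patch's nodes_to_delete/delete_nodes pattern: node ids stay stable during the scan, and a single batched delete_nodes call does the cleanup at the end.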