Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[async] Optimize StateFlowGraph::optimize_listgen #1894

Merged
merged 5 commits into from
Sep 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 16 additions & 19 deletions misc/async_mgpcg.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import taichi as ti
import time

real = ti.f32
ti.init(default_fp=real,
Expand All @@ -9,13 +10,12 @@
async_opt_dse=True,
async_opt_activation_demotion=True,
async_opt_fusion=True,
kernel_profiler=False
kernel_profiler=True
#, async_opt_intermediate_file="mgpcg"
)

# grid parameters
N = 128
N_gui = 512 # gui resolution

n_mg_levels = 5
pre_and_post_smoothing = 2
Expand All @@ -35,7 +35,6 @@
alpha = ti.field(dtype=real) # step size
beta = ti.field(dtype=real) # step size
sum = ti.field(dtype=real) # storage for reductions
pixels = ti.field(dtype=real, shape=(N_gui, N_gui)) # image buffer
rTr = ti.field(dtype=real, shape=())
old_zTr = ti.field(dtype=real, shape=())
new_zTr = ti.field(dtype=real, shape=())
Expand Down Expand Up @@ -146,17 +145,6 @@ def apply_preconditioner():
smooth(l, 0)


@ti.kernel
def paint():
    """Render one z-slice of the solution field ``x`` into the image buffer.

    Samples the fixed slice k = 3/8 of the domain depth, maps each GUI
    pixel back to grid coordinates, and normalizes by N_tot.
    """
    k_slice = N_tot * 3 // 8
    for i, j in pixels:
        # Map GUI pixel (i, j) to interior grid coordinates.
        src_i = int(i * N / N_gui) + N_ext
        src_j = int(j * N / N_gui) + N_ext
        pixels[i, j] = x[src_i, src_j, k_slice] / N_tot


gui = ti.GUI("mgpcg", res=(N_gui, N_gui))

init()

sum[None] = 0.0
Expand Down Expand Up @@ -193,8 +181,7 @@ def update_beta():
old_zTr[None] = new_zTr[None]


# CG
for i in range(10):
def iterate():
# alpha = rTr / pTAp
compute_Ap()
reduce(p, Ap, pAp)
Expand Down Expand Up @@ -224,9 +211,19 @@ def update_beta():
# p = z + beta p
update_p()

paint()
gui.set_image(pixels)
gui.show()

def loud_sync():
    """Block until all queued async tasks finish and print the wall time."""
    t = time.time()
    ti.sync()
    print(f'{time.time() - t:.3f} s (compilation + execution)')


ti.sync()
for _ in range(3):
for i in range(10):
iterate()
loud_sync()

ti.kernel_profiler_print()
ti.core.print_stat()
ti.print_profile_info()
9 changes: 7 additions & 2 deletions taichi/backends/cuda/cuda_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,14 @@ void CUDAContext::launch(void *func,
unsigned grid_dim,
unsigned block_dim,
std::size_t shared_mem_bytes) {
// It is important to keep a handle since in async mode
// a constant folding kernel may happen during a kernel launch
// then profiler->start and profiler->stop mismatch.

KernelProfilerBase::TaskHandle task_handle;
// Kernel launch
if (profiler)
profiler->start(task_name);
task_handle = profiler->start_with_handle(task_name);
auto context_guard = CUDAContext::get_instance().get_guard();

// TODO: remove usages of get_current_program here.
Expand All @@ -78,7 +83,7 @@ void CUDAContext::launch(void *func,
nullptr);
}
if (profiler)
profiler->stop();
profiler->stop(task_handle);

if (get_current_program().config.debug) {
driver.stream_synchronize(nullptr);
Expand Down
5 changes: 4 additions & 1 deletion taichi/program/async_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ IRHandle IRBank::fuse(IRHandle handle_a, IRHandle handle_b, Kernel *kernel) {
return result;
}

TI_INFO("Begin uncached fusion");
TI_TRACE("Begin uncached fusion");
// We are about to change both |task_a| and |task_b|. Clone them first.
auto cloned_task_a = handle_a.clone();
auto cloned_task_b = handle_b.clone();
Expand Down Expand Up @@ -382,6 +382,7 @@ void AsyncEngine::launch(Kernel *kernel, Context &context) {
}

TaskMeta *get_task_meta(IRBank *ir_bank, const TaskLaunchRecord &t) {
TI_AUTO_PROF
// TODO: this function should ideally take only an IRNode
static std::mutex mut;

Expand Down Expand Up @@ -485,6 +486,7 @@ TaskMeta *get_task_meta(IRBank *ir_bank, const TaskLaunchRecord &t) {
}

TaskFusionMeta get_task_fusion_meta(IRBank *bank, const TaskLaunchRecord &t) {
TI_AUTO_PROF
// TODO: this function should ideally take only an IRNode
auto &fusion_meta_bank = bank->fusion_meta_bank_;
if (fusion_meta_bank.find(t.ir_handle) != fusion_meta_bank.end()) {
Expand Down Expand Up @@ -536,6 +538,7 @@ void AsyncEngine::enqueue(const TaskLaunchRecord &t) {
}

void AsyncEngine::synchronize() {
TI_AUTO_PROF
bool modified = true;
TI_TRACE("Synchronizing SFG of {} nodes", sfg->size());
debug_sfg("initial");
Expand Down
1 change: 1 addition & 0 deletions taichi/program/async_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
TLANG_NAMESPACE_BEGIN

// Deep-copies the IR tree referenced by this handle; the clone is profiled
// via TI_AUTO_PROF since it runs on hot async-compilation paths.
std::unique_ptr<IRNode> IRHandle::clone() const {
TI_AUTO_PROF
// TODO: remove get_kernel() here
// const_cast is needed because clone() takes a mutable IRNode* even though
// this handle only holds a const view of the IR.
return irpass::analysis::clone(const_cast<IRNode *>(ir_), ir_->get_kernel());
}
Expand Down
13 changes: 6 additions & 7 deletions taichi/program/kernel_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,30 +103,29 @@ class DefaultProfiler : public KernelProfilerBase {
class KernelProfilerCUDA : public KernelProfilerBase {
public:
#if defined(TI_WITH_CUDA)
void *current_stop;

std::map<std::string, std::vector<std::pair<void *, void *>>>
outstanding_events;
#endif

void start(const std::string &kernel_name) override {
// Begins timing |kernel_name| on the CUDA stream and returns the stop event
// as an opaque handle, so a matching stop(handle) cannot be mispaired even
// when another kernel launch is interleaved (e.g. async constant folding).
TaskHandle start_with_handle(const std::string &kernel_name) override {
#if defined(TI_WITH_CUDA)
// Create a (start, stop) CUDA event pair and record the start event now;
// the stop event is recorded later by stop(handle).
void *start, *stop;
CUDADriver::get_instance().event_create(&start, CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_create(&stop, CU_EVENT_DEFAULT);
CUDADriver::get_instance().event_record(start, 0);
outstanding_events[kernel_name].push_back(std::make_pair(start, stop));
// NOTE(review): both the pre-change line (current_stop member write) and the
// post-change line (returning the handle) appear here — this is a diff view.
current_stop = stop;
return stop;
#else
printf("CUDA Profiler not implemented;\n");
TI_NOT_IMPLEMENTED;
#endif
}

virtual void stop() override {
// Records the stop event for a task previously begun via start_with_handle;
// taking the handle explicitly avoids start/stop mismatches under async launches.
virtual void stop(TaskHandle handle) override {
#if defined(TI_WITH_CUDA)
// NOTE(review): both the pre-change line (current_stop) and the post-change
// line (handle) appear here — this is a diff view showing old vs new code.
CUDADriver::get_instance().event_record(current_stop, 0);
CUDADriver::get_instance().event_record(handle, 0);
#else
printf("CUDA Profiler not implemented;\n");
TI_NOT_IMPLEMENTED;
#endif
}

Expand Down
13 changes: 11 additions & 2 deletions taichi/program/kernel_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ class KernelProfilerBase {
double total_time;

public:
// Needed for the CUDA backend since we need to know which task to "stop"
using TaskHandle = void *;

void clear() {
total_time = 0;
records.clear();
Expand All @@ -42,12 +45,18 @@ class KernelProfilerBase {

virtual std::string title() const = 0;

virtual void start(const std::string &kernel_name) = 0;
// TODO: remove start and always use start_with_handle
virtual void start(const std::string &kernel_name){TI_NOT_IMPLEMENTED};

virtual TaskHandle start_with_handle(const std::string &kernel_name){
TI_NOT_IMPLEMENTED};

static void profiler_start(KernelProfilerBase *profiler,
const char *kernel_name);

virtual void stop() = 0;
virtual void stop(){TI_NOT_IMPLEMENTED};

virtual void stop(TaskHandle){TI_NOT_IMPLEMENTED};

static void profiler_stop(KernelProfilerBase *profiler);

Expand Down
26 changes: 17 additions & 9 deletions taichi/program/state_flow_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ void StateFlowGraph::insert_state_flow(Node *from, Node *to, AsyncState state) {
}

bool StateFlowGraph::optimize_listgen() {
TI_AUTO_PROF
bool modified = false;

std::vector<std::pair<int, int>> common_pairs;
Expand All @@ -116,16 +117,18 @@ bool StateFlowGraph::optimize_listgen() {
for (auto &record : listgen_nodes) {
auto &listgens = record.second;

// Thanks to the dependency edges, the order of nodes in listgens seems to
// be UNIQUE
// TODO: prove
// Thanks to the dependency edges, the order of nodes in listgens is
// UNIQUE. (Consider the list state of the SNode.)

// We can only replace a continuous subset of listgens entries
// We can only replace a continuous subset of listgen entries.
// So the nested loop below is actually O(n).
for (int i = 0; i < listgens.size(); i++) {
auto node_a = listgens[i];

bool erased_any = false;

auto new_i = i;

for (int j = i + 1; j < listgens.size(); j++) {
auto node_b = listgens[j];

Expand Down Expand Up @@ -186,15 +189,12 @@ bool StateFlowGraph::optimize_listgen() {

nodes_to_delete.insert(node_b->node_id);
erased_any = true;
new_i = j;
}

if (erased_any)
break;
i = new_i;
}
}

TI_ASSERT(nodes_to_delete.size() % 2 == 0);

if (!nodes_to_delete.empty()) {
modified = true;
delete_nodes(nodes_to_delete);
Expand Down Expand Up @@ -445,6 +445,7 @@ bool StateFlowGraph::fuse() {
}

std::vector<TaskLaunchRecord> StateFlowGraph::extract(bool sort) {
TI_AUTO_PROF
if (sort)
topo_sort_nodes();
std::vector<TaskLaunchRecord> tasks;
Expand Down Expand Up @@ -673,6 +674,7 @@ std::string StateFlowGraph::dump_dot(const std::optional<std::string> &rankdir,
}

void StateFlowGraph::topo_sort_nodes() {
TI_AUTO_PROF
std::deque<std::unique_ptr<Node>> queue;
std::vector<std::unique_ptr<Node>> new_nodes;
std::vector<int> degrees_in(nodes_.size());
Expand Down Expand Up @@ -718,6 +720,7 @@ void StateFlowGraph::topo_sort_nodes() {
}

void StateFlowGraph::reid_nodes() {
TI_AUTO_PROF
for (int i = 0; i < nodes_.size(); i++) {
nodes_[i]->node_id = i;
}
Expand All @@ -727,6 +730,7 @@ void StateFlowGraph::reid_nodes() {
void StateFlowGraph::replace_reference(StateFlowGraph::Node *node_a,
StateFlowGraph::Node *node_b,
bool only_output_edges) {
TI_AUTO_PROF
// replace all edges to node A with new ones to node B
for (auto &edges : node_a->output_edges) {
// Find all nodes C that points to A
Expand Down Expand Up @@ -763,6 +767,7 @@ void StateFlowGraph::replace_reference(StateFlowGraph::Node *node_a,

void StateFlowGraph::delete_nodes(
const std::unordered_set<int> &indices_to_delete) {
TI_AUTO_PROF
std::vector<std::unique_ptr<Node>> new_nodes_;
std::unordered_set<Node *> nodes_to_delete;

Expand Down Expand Up @@ -796,6 +801,7 @@ void StateFlowGraph::delete_nodes(
}

bool StateFlowGraph::optimize_dead_store() {
TI_AUTO_PROF
bool modified = false;

for (int i = 1; i < nodes_.size(); i++) {
Expand Down Expand Up @@ -881,6 +887,7 @@ bool StateFlowGraph::optimize_dead_store() {
}

void StateFlowGraph::verify() {
TI_AUTO_PROF
const int n = nodes_.size();
reid_nodes();
for (int i = 0; i < n; i++) {
Expand Down Expand Up @@ -936,6 +943,7 @@ void StateFlowGraph::verify() {
}

bool StateFlowGraph::demote_activation() {
TI_AUTO_PROF
bool modified = false;

topo_sort_nodes();
Expand Down