diff --git a/misc/async_mgpcg.py b/misc/async_mgpcg.py
index 2efdd0bffe969..ae7aacf8e61df 100644
--- a/misc/async_mgpcg.py
+++ b/misc/async_mgpcg.py
@@ -1,5 +1,6 @@
 import numpy as np
 import taichi as ti
+import time
 
 real = ti.f32
 ti.init(default_fp=real,
@@ -9,13 +10,12 @@
         async_opt_dse=True,
         async_opt_activation_demotion=True,
         async_opt_fusion=True,
-        kernel_profiler=False
+        kernel_profiler=True
         #, async_opt_intermediate_file="mgpcg"
         )
 
 # grid parameters
 N = 128
-N_gui = 512  # gui resolution
 
 n_mg_levels = 5
 pre_and_post_smoothing = 2
@@ -35,7 +35,6 @@
 alpha = ti.field(dtype=real)  # step size
 beta = ti.field(dtype=real)  # step size
 sum = ti.field(dtype=real)  # storage for reductions
-pixels = ti.field(dtype=real, shape=(N_gui, N_gui))  # image buffer
 rTr = ti.field(dtype=real, shape=())
 old_zTr = ti.field(dtype=real, shape=())
 new_zTr = ti.field(dtype=real, shape=())
@@ -146,17 +145,6 @@ def apply_preconditioner():
             smooth(l, 0)
 
 
-@ti.kernel
-def paint():
-    kk = N_tot * 3 // 8
-    for i, j in pixels:
-        ii = int(i * N / N_gui) + N_ext
-        jj = int(j * N / N_gui) + N_ext
-        pixels[i, j] = x[ii, jj, kk] / N_tot
-
-
-gui = ti.GUI("mgpcg", res=(N_gui, N_gui))
-
 init()
 
 sum[None] = 0.0
@@ -193,8 +181,7 @@ def update_beta():
     old_zTr[None] = new_zTr[None]
 
 
-# CG
-for i in range(10):
+def iterate():
     # alpha = rTr / pTAp
     compute_Ap()
     reduce(p, Ap, pAp)
@@ -224,9 +211,19 @@ def update_beta():
     # p = z + beta p
     update_p()
 
-paint()
-gui.set_image(pixels)
-gui.show()
+
+def loud_sync():
+    t = time.time()
+    ti.sync()
+    print(f'{time.time() - t:.3f} s (compilation + execution)')
+
+
+ti.sync()
+for _ in range(3):
+    for i in range(10):
+        iterate()
+    loud_sync()
 
 ti.kernel_profiler_print()
 ti.core.print_stat()
+ti.print_profile_info()
diff --git a/taichi/backends/cuda/cuda_context.cpp b/taichi/backends/cuda/cuda_context.cpp
index d86ba92de9246..0438414699f3c 100644
--- a/taichi/backends/cuda/cuda_context.cpp
+++ b/taichi/backends/cuda/cuda_context.cpp
@@ -59,9 +59,14 @@ void CUDAContext::launch(void *func,
                          unsigned grid_dim,
                          unsigned block_dim,
                          std::size_t shared_mem_bytes) {
+  // It is important to keep a handle since in async mode
+  // a constant folding kernel may happen during a kernel launch
+  // then profiler->start and profiler->stop mismatch.
+
+  KernelProfilerBase::TaskHandle task_handle;
   // Kernel launch
   if (profiler)
-    profiler->start(task_name);
+    task_handle = profiler->start_with_handle(task_name);
   auto context_guard = CUDAContext::get_instance().get_guard();
 
   // TODO: remove usages of get_current_program here.
@@ -78,7 +83,7 @@ void CUDAContext::launch(void *func,
                          nullptr);
   }
   if (profiler)
-    profiler->stop();
+    profiler->stop(task_handle);
 
   if (get_current_program().config.debug) {
     driver.stream_synchronize(nullptr);
diff --git a/taichi/program/async_engine.cpp b/taichi/program/async_engine.cpp
index c111de9f02e87..38f2003dafce4 100644
--- a/taichi/program/async_engine.cpp
+++ b/taichi/program/async_engine.cpp
@@ -75,7 +75,7 @@ IRHandle IRBank::fuse(IRHandle handle_a, IRHandle handle_b, Kernel *kernel) {
     return result;
   }
 
-  TI_INFO("Begin uncached fusion");
+  TI_TRACE("Begin uncached fusion");
   // We are about to change both |task_a| and |task_b|. Clone them first.
   auto cloned_task_a = handle_a.clone();
   auto cloned_task_b = handle_b.clone();
@@ -382,6 +382,7 @@ void AsyncEngine::launch(Kernel *kernel, Context &context) {
 }
 
 TaskMeta *get_task_meta(IRBank *ir_bank, const TaskLaunchRecord &t) {
+  TI_AUTO_PROF
   // TODO: this function should ideally take only an IRNode
   static std::mutex mut;
 
@@ -485,6 +486,7 @@ TaskMeta *get_task_meta(IRBank *ir_bank, const TaskLaunchRecord &t) {
 }
 
 TaskFusionMeta get_task_fusion_meta(IRBank *bank, const TaskLaunchRecord &t) {
+  TI_AUTO_PROF
   // TODO: this function should ideally take only an IRNode
   auto &fusion_meta_bank = bank->fusion_meta_bank_;
   if (fusion_meta_bank.find(t.ir_handle) != fusion_meta_bank.end()) {
@@ -536,6 +538,7 @@ void AsyncEngine::enqueue(const TaskLaunchRecord &t) {
 }
 
 void AsyncEngine::synchronize() {
+  TI_AUTO_PROF
   bool modified = true;
   TI_TRACE("Synchronizing SFG of {} nodes", sfg->size());
   debug_sfg("initial");
diff --git a/taichi/program/async_utils.cpp b/taichi/program/async_utils.cpp
index 3112246c2925f..41677d8d95ee2 100644
--- a/taichi/program/async_utils.cpp
+++ b/taichi/program/async_utils.cpp
@@ -7,6 +7,7 @@ TLANG_NAMESPACE_BEGIN
 
 
 std::unique_ptr<IRNode> IRHandle::clone() const {
+  TI_AUTO_PROF
   // TODO: remove get_kernel() here
   return irpass::analysis::clone(const_cast<IRNode *>(ir_), ir_->get_kernel());
 }
diff --git a/taichi/program/kernel_profiler.cpp b/taichi/program/kernel_profiler.cpp
index 72eb2090abb09..8601457958e56 100644
--- a/taichi/program/kernel_profiler.cpp
+++ b/taichi/program/kernel_profiler.cpp
@@ -103,30 +103,29 @@ class DefaultProfiler : public KernelProfilerBase {
 class KernelProfilerCUDA : public KernelProfilerBase {
  public:
 #if defined(TI_WITH_CUDA)
-  void *current_stop;
 
   std::map<std::string, std::vector<std::pair<void *, void *>>>
       outstanding_events;
 #endif
 
-  void start(const std::string &kernel_name) override {
+  TaskHandle start_with_handle(const std::string &kernel_name) override {
 #if defined(TI_WITH_CUDA)
     void *start, *stop;
     CUDADriver::get_instance().event_create(&start, CU_EVENT_DEFAULT);
     CUDADriver::get_instance().event_create(&stop, CU_EVENT_DEFAULT);
     CUDADriver::get_instance().event_record(start, 0);
     outstanding_events[kernel_name].push_back(std::make_pair(start, stop));
-    current_stop = stop;
+    return stop;
 #else
-    printf("CUDA Profiler not implemented;\n");
+    TI_NOT_IMPLEMENTED;
 #endif
   }
 
-  virtual void stop() override {
+  virtual void stop(TaskHandle handle) override {
 #if defined(TI_WITH_CUDA)
-    CUDADriver::get_instance().event_record(current_stop, 0);
+    CUDADriver::get_instance().event_record(handle, 0);
 #else
-    printf("CUDA Profiler not implemented;\n");
+    TI_NOT_IMPLEMENTED;
 #endif
   }
 
diff --git a/taichi/program/kernel_profiler.h b/taichi/program/kernel_profiler.h
index b478fecdc1828..26c9fcc924340 100644
--- a/taichi/program/kernel_profiler.h
+++ b/taichi/program/kernel_profiler.h
@@ -33,6 +33,9 @@ class KernelProfilerBase {
   double total_time;
 
  public:
+  // Needed for the CUDA backend since we need to know which task to "stop"
+  using TaskHandle = void *;
+
   void clear() {
     total_time = 0;
     records.clear();
@@ -42,12 +45,18 @@ class KernelProfilerBase {
 
   virtual std::string title() const = 0;
 
-  virtual void start(const std::string &kernel_name) = 0;
+  // TODO: remove start and always use start_with_handle
+  virtual void start(const std::string &kernel_name){TI_NOT_IMPLEMENTED};
+
+  virtual TaskHandle start_with_handle(const std::string &kernel_name){
+      TI_NOT_IMPLEMENTED};
 
   static void profiler_start(KernelProfilerBase *profiler,
                              const char *kernel_name);
 
-  virtual void stop() = 0;
+  virtual void stop(){TI_NOT_IMPLEMENTED};
+
+  virtual void stop(TaskHandle){TI_NOT_IMPLEMENTED};
 
   static void profiler_stop(KernelProfilerBase *profiler);
 
diff --git a/taichi/program/state_flow_graph.cpp b/taichi/program/state_flow_graph.cpp
index 43875c7ee8fd2..a01c7aeaaf3fc 100644
--- a/taichi/program/state_flow_graph.cpp
+++ b/taichi/program/state_flow_graph.cpp
@@ -96,6 +96,7 @@ void StateFlowGraph::insert_state_flow(Node *from, Node *to, AsyncState state) {
 }
 
 bool StateFlowGraph::optimize_listgen() {
+  TI_AUTO_PROF
   bool modified = false;
 
   std::vector<std::pair<int, int>> common_pairs;
@@ -116,16 +117,18 @@ bool StateFlowGraph::optimize_listgen() {
   for (auto &record : listgen_nodes) {
     auto &listgens = record.second;
 
-    // Thanks to the dependency edges, the order of nodes in listgens seems to
-    // be UNIQUE
-    // TODO: prove
+    // Thanks to the dependency edges, the order of nodes in listgens is
+    // UNIQUE. (Consider the list state of the SNode.)
 
-    // We can only replace a continuous subset of listgens entries
+    // We can only replace a continuous subset of listgen entries.
+    // So the nested loop below is actually O(n).
 
     for (int i = 0; i < listgens.size(); i++) {
       auto node_a = listgens[i];
       bool erased_any = false;
 
+      auto new_i = i;
+
       for (int j = i + 1; j < listgens.size(); j++) {
         auto node_b = listgens[j];
 
@@ -186,15 +189,12 @@ bool StateFlowGraph::optimize_listgen() {
 
         nodes_to_delete.insert(node_b->node_id);
         erased_any = true;
+        new_i = j;
       }
-
-      if (erased_any)
-        break;
+      i = new_i;
     }
   }
 
-  TI_ASSERT(nodes_to_delete.size() % 2 == 0);
-
   if (!nodes_to_delete.empty()) {
     modified = true;
     delete_nodes(nodes_to_delete);
@@ -445,6 +445,7 @@ bool StateFlowGraph::fuse() {
 }
 
 std::vector<TaskLaunchRecord> StateFlowGraph::extract(bool sort) {
+  TI_AUTO_PROF
   if (sort)
     topo_sort_nodes();
   std::vector<TaskLaunchRecord> tasks;
@@ -673,6 +674,7 @@ std::string StateFlowGraph::dump_dot(const std::optional<std::string> &rankdir,
 }
 
 void StateFlowGraph::topo_sort_nodes() {
+  TI_AUTO_PROF
   std::deque<std::unique_ptr<Node>> queue;
   std::vector<std::unique_ptr<Node>> new_nodes;
   std::vector<int> degrees_in(nodes_.size());
@@ -718,6 +720,7 @@ void StateFlowGraph::topo_sort_nodes() {
 }
 
 void StateFlowGraph::reid_nodes() {
+  TI_AUTO_PROF
   for (int i = 0; i < nodes_.size(); i++) {
     nodes_[i]->node_id = i;
   }
@@ -727,6 +730,7 @@ void StateFlowGraph::reid_nodes() {
 void StateFlowGraph::replace_reference(StateFlowGraph::Node *node_a,
                                        StateFlowGraph::Node *node_b,
                                        bool only_output_edges) {
+  TI_AUTO_PROF
   // replace all edges to node A with new ones to node B
   for (auto &edges : node_a->output_edges) {
     // Find all nodes C that points to A
@@ -763,6 +767,7 @@ void StateFlowGraph::replace_reference(StateFlowGraph::Node *node_a,
 
 void StateFlowGraph::delete_nodes(
     const std::unordered_set<int> &indices_to_delete) {
+  TI_AUTO_PROF
   std::vector<std::unique_ptr<Node>> new_nodes_;
   std::unordered_set<Node *> nodes_to_delete;
 
@@ -796,6 +801,7 @@ void StateFlowGraph::delete_nodes(
 }
 
 bool StateFlowGraph::optimize_dead_store() {
+  TI_AUTO_PROF
   bool modified = false;
 
   for (int i = 1; i < nodes_.size(); i++) {
@@ -881,6 +887,7 @@ bool StateFlowGraph::optimize_dead_store() {
 }
 
 void StateFlowGraph::verify() {
+  TI_AUTO_PROF
   const int n = nodes_.size();
   reid_nodes();
   for (int i = 0; i < n; i++) {
@@ -936,6 +943,7 @@ void StateFlowGraph::verify() {
 }
 
 bool StateFlowGraph::demote_activation() {
+  TI_AUTO_PROF
   bool modified = false;
   topo_sort_nodes();
 