diff --git a/taichi/runtime/llvm/internal_function.h b/taichi/runtime/llvm/internal_function.h
index 85382ca0a4dac..6ae069eb847f1 100644
--- a/taichi/runtime/llvm/internal_function.h
+++ b/taichi/runtime/llvm/internal_function.h
@@ -1,3 +1,14 @@
+// Note that the TI_ASSERT provided by the runtime does not fail immediately.
+// As a hack, we check the last error code right after every TI_ASSERT call.
+#define TI_TEST_CHECK(cond, r)                              \
+  do {                                                      \
+    TI_ASSERT(cond);                                        \
+    if ((r)->error_code) {                                  \
+      taichi_printf((r), "%s", (r)->error_message_buffer);  \
+      abort();                                              \
+    }                                                       \
+  } while (0)
+
 i32 do_nothing(Context *context) {
   return 0;
 }
@@ -28,7 +39,7 @@ i32 test_list_manager(Context *context) {
     list->append(&j);
   }
   for (int i = 0; i < 320; i++) {
-    TI_ASSERT(list->get(i) == i + 5);
+    TI_TEST_CHECK(list->get(i) == i + 5, runtime);
   }
   return 0;
 }
@@ -54,14 +65,60 @@ i32 test_node_allocator(Context *context) {
     ptrs[i] = nodes->allocate();
   }
   for (int i = 5; i < 19; i++) {
-    TI_ASSERT(nodes->locate(ptrs[i]) == i);
+    TI_TEST_CHECK(nodes->locate(ptrs[i]) == i, runtime);
   }
   for (int i = 19; i < 24; i++) {
     auto idx = nodes->locate(ptrs[i]);
     taichi_printf(runtime, "i %d", i);
     taichi_printf(runtime, "idx %d", idx);
-    TI_ASSERT(idx == i - 19);
+    TI_TEST_CHECK(idx == i - 19, runtime);
+  }
+  return 0;
+}
+
+i32 test_node_allocator_gc_cpu(Context *context) {
+  auto runtime = context->runtime;
+  taichi_printf(runtime, "LLVMRuntime %p\n", runtime);
+  auto nodes = context->runtime->create<NodeManager>(runtime, sizeof(i64), 4);
+  constexpr int kN = 24;
+  constexpr int kHalfN = kN / 2;
+  Ptr ptrs[kN];
+  // Initially |free_list| is empty.
+  TI_TEST_CHECK(nodes->free_list->size() == 0, runtime);
+  for (int i = 0; i < kN; i++) {
+    taichi_printf(runtime, "[1] allocating %d\n", i);
+    ptrs[i] = nodes->allocate();
+    taichi_printf(runtime, "[1] ptr %p\n", ptrs[i]);
+  }
+  for (int i = 0; i < kN; i++) {
+    taichi_printf(runtime, "[1] deallocating %d\n", i);
+    taichi_printf(runtime, "[1] ptr %p\n", ptrs[i]);
+    nodes->recycle(ptrs[i]);
+  }
+  TI_TEST_CHECK(nodes->free_list->size() == 0, runtime);
+  nodes->gc_serial();
+  // After the first round of GC, |free_list| should hold all |kN| items.
+  TI_TEST_CHECK(nodes->free_list->size() == kN, runtime);
+
+  // In the second round, all items should come from |free_list|.
+  for (int i = 0; i < kHalfN; i++) {
+    taichi_printf(runtime, "[2] allocating %d\n", i);
+    ptrs[i] = nodes->allocate();
+    taichi_printf(runtime, "[2] ptr %p\n", ptrs[i]);
   }
+  TI_TEST_CHECK(nodes->free_list_used == kHalfN, runtime);
+  for (int i = 0; i < kHalfN; i++) {
+    taichi_printf(runtime, "[2] deallocating %d\n", i);
+    taichi_printf(runtime, "[2] ptr %p\n", ptrs[i]);
+    nodes->recycle(ptrs[i]);
+  }
+  nodes->gc_serial();
+  // After GC, all items should be returned to |free_list|.
+ printf("free_list_size=%d\n", nodes->free_list->size()); + TI_TEST_CHECK(nodes->free_list->size() == kN, runtime); + return 0; } + +#undef TI_TEST_CHECK diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp index 44a297af30325..cd7fe6e434ac4 100644 --- a/taichi/runtime/llvm/runtime.cpp +++ b/taichi/runtime/llvm/runtime.cpp @@ -409,6 +409,10 @@ struct ListManager { num_elements = 0; } + void resize(i32 n) { + num_elements = n; + } + Ptr get_element_ptr(i32 i) { return chunks[i >> log2chunk_num_elements] + element_size * (i & ((1 << log2chunk_num_elements) - 1)); @@ -612,8 +616,9 @@ struct NodeManager { free_list->get(i - free_list_used) = free_list->get(i); } + const i32 num_unused = max_i32(free_list->size() - free_list_used, 0); free_list_used = 0; - free_list->clear(); + free_list->resize(num_unused); // zero-fill recycled and push to free list for (int i = 0; i < recycled_list->size(); i++) { diff --git a/tests/python/test_internal_func.py b/tests/python/test_internal_func.py index 16fba18bee3af..dee7292e926fc 100644 --- a/tests/python/test_internal_func.py +++ b/tests/python/test_internal_func.py @@ -50,3 +50,11 @@ def test(): test() test() + + +def test_node_manager_gc(): + @ti.kernel + def test_cpu(): + ti.call_internal("test_node_allocator_gc_cpu") + + test_cpu()