diff --git a/NEWS.md b/NEWS.md index 695867f2e4f63..18a5b0e620e7d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,6 +38,7 @@ Compiler/Runtime improvements * All uses of the `@pure` macro in `Base` have been replaced with the now-preferred `Base.@assume_effects` ([#44776]). * `invoke(f, invokesig, args...)` calls to a less-specific method than would normally be chosen for `f(args...)` are no longer spuriously invalidated when loading package precompile files ([#46010]). +* The mark phase of the Garbage Collector is now multi-threaded ([#48600]). Command-line option changes --------------------------- @@ -49,6 +50,8 @@ Command-line option changes number of interactive threads to create (`auto` currently means 1) ([#42302]). * New option `--heap-size-hint=<size>` suggests a size limit to invoke garbage collection more eagerly. The size may be specified in bytes, kilobytes (1000k), megabytes (300M), or gigabytes (1.5G) ([#45369]). +* New option `--gcthreads` to set how many threads will be used by the Garbage Collector ([#48600]). + The default is set to `N/2` where `N` is the number of worker threads (`--threads`) used by Julia. Multi-threading changes ----------------------- diff --git a/base/options.jl b/base/options.jl index b9fa5b6ec5508..fb043672dc19a 100644 --- a/base/options.jl +++ b/base/options.jl @@ -11,6 +11,7 @@ struct JLOptions cpu_target::Ptr{UInt8} nthreadpools::Int16 nthreads::Int16 + ngcthreads::Int16 nthreads_per_pool::Ptr{Int16} nprocs::Int32 machine_file::Ptr{UInt8} diff --git a/base/threadingconstructs.jl b/base/threadingconstructs.jl index 0c6563c54f99f..ce216f290ae12 100644 --- a/base/threadingconstructs.jl +++ b/base/threadingconstructs.jl @@ -136,6 +136,13 @@ function threadpooltids(pool::Symbol) end end +""" + Threads.ngcthreads() -> Int + +Returns the number of GC threads currently configured. +""" +ngcthreads() = Int(unsafe_load(cglobal(:jl_n_gcthreads, Cint))) + 1 + function threading_run(fun, static) ccall(:jl_enter_threaded_region, Cvoid, ()) n = threadpoolsize() diff --git a/doc/man/julia.1 b/doc/man/julia.1 index 383c588c58dae..fa9f641b1e76f 100644 --- a/doc/man/julia.1 +++ b/doc/man/julia.1 @@ -118,6 +118,11 @@ supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads. +.TP +--gcthreads +Enable n GC threads; if unspecified, defaults to half of the +number of compute worker threads. + .TP -p, --procs {N|auto} Integer value N launches N additional local worker processes `auto` launches as many workers diff --git a/doc/src/base/multi-threading.md b/doc/src/base/multi-threading.md index 4932aef4cc938..fb75b21479707 100644 --- a/doc/src/base/multi-threading.md +++ b/doc/src/base/multi-threading.md @@ -10,6 +10,7 @@ Base.Threads.nthreads Base.Threads.threadpool Base.Threads.nthreadpools Base.Threads.threadpoolsize +Base.Threads.ngcthreads ``` See also [Multi-Threading](@ref man-multithreading). 
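A brief orientation note before the remaining documentation hunks: `--gcthreads` sets the count of dedicated GC threads, NEWS.md above documents the default as `N/2` of the `--threads` worker count, and the new `Threads.ngcthreads()` returns `jl_n_gcthreads + 1`. The C sketch below only restates that arithmetic under stated assumptions: `default_gcthreads` is a hypothetical helper (the diff does not show where the default is actually computed), and the `+ 1` is read here as the mutator thread that triggered collection also taking part in marking, per the `gc_master_tid` comment later in this patch.

```c
/* Hedged sketch, not part of the patch: how the documented N/2 default and
 * the Threads.ngcthreads() accounting fit together. `default_gcthreads` is
 * a hypothetical name; only `jl_n_gcthreads` and the N/2 default are taken
 * from the diff. */
#include <stdint.h>

extern int jl_n_gcthreads; /* dedicated GC threads (global added by this patch) */

/* Default when --gcthreads is not given: half the compute worker threads. */
static int16_t default_gcthreads(int16_t nthreads)
{
    int16_t n = nthreads / 2;
    return n < 0 ? 0 : n; /* assumption: clamp so the count is never negative */
}

/* Mirrors `ngcthreads() = Int(unsafe_load(cglobal(:jl_n_gcthreads, Cint))) + 1`
 * from base/threadingconstructs.jl: the thread that triggers GC is counted
 * as a marking participant alongside the dedicated GC threads. */
static int ngcthreads(void)
{
    return jl_n_gcthreads + 1;
}
```

Under this reading, `julia --threads 8 --gcthreads 4` would report `Threads.ngcthreads() == 5`, and `julia --threads 8` alone would default to 4 GC threads.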
diff --git a/doc/src/manual/command-line-interface.md b/doc/src/manual/command-line-interface.md index 54c56a354c7a3..781a77a33dadb 100644 --- a/doc/src/manual/command-line-interface.md +++ b/doc/src/manual/command-line-interface.md @@ -106,8 +106,9 @@ The following is a complete list of command-line switches available when launchi |`-e`, `--eval <expr>` |Evaluate `<expr>`| |`-E`, `--print <expr>` |Evaluate `<expr>` and display the result| |`-L`, `--load <file>` |Load `<file>` immediately on all processors| -|`-t`, `--threads {N\|auto`} |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| -|`-p`, `--procs {N\|auto`} |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| +|`-t`, `--threads {N\|auto}` |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| +|`--gcthreads {N}` |Enable N GC threads; if unspecified, defaults to half of the number of compute worker threads.| +|`-p`, `--procs {N\|auto}` |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| |`--machine-file <file>` |Run processes on hosts listed in `<file>`| |`-i` |Interactive mode; REPL runs and `isinteractive()` is true| |`-q`, `--quiet` |Quiet startup: no banner, suppress REPL warnings| diff --git a/doc/src/manual/environment-variables.md b/doc/src/manual/environment-variables.md index f29e5b7aaf8f7..68fb79fc0a9a6 100644 --- a/doc/src/manual/environment-variables.md +++ b/doc/src/manual/environment-variables.md @@ -315,6 +315,14 @@ then spinning threads never sleep. Otherwise, `$JULIA_THREAD_SLEEP_THRESHOLD` is interpreted as an unsigned 64-bit integer (`uint64_t`) and gives, in nanoseconds, the amount of time after which spinning threads should sleep. +### [`JULIA_NUM_GC_THREADS`](@id env-gc-threads) + +Sets the number of threads used by the garbage collector. If unspecified, it defaults to +half of the number of worker threads. + +!!! compat "Julia 1.10" + This environment variable was added in Julia 1.10. + ### `JULIA_EXCLUSIVE` If set to anything besides `0`, then Julia's thread policy is consistent with diff --git a/doc/src/manual/multi-threading.md b/doc/src/manual/multi-threading.md index f8c6b040941f4..e1cb17bd5b93f 100644 --- a/doc/src/manual/multi-threading.md +++ b/doc/src/manual/multi-threading.md @@ -72,6 +72,15 @@ julia> Threads.threadid() three processes have 2 threads enabled. For more fine grained control over worker threads use [`addprocs`](@ref) and pass `-t`/`--threads` as `exeflags`. +### Multiple GC Threads + +The Garbage Collector (GC) can use multiple threads. The number used defaults to half the number +of compute worker threads and can be configured with the `--gcthreads` command-line argument or the +[`JULIA_NUM_GC_THREADS`](@ref env-gc-threads) environment variable. + +!!! 
compat "Julia 1.10" + The `--gcthreads` command line argument requires at least Julia 1.10. + ## [Threadpools](@id man-threadpools) When a program's threads are busy with many tasks to run, tasks may experience diff --git a/src/Makefile b/src/Makefile index 6f10bf505e2ef..51d8bf1f2de65 100644 --- a/src/Makefile +++ b/src/Makefile @@ -99,7 +99,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) endif diff --git a/src/gc-debug.c b/src/gc-debug.c index 3f60ca17e0dc4..0f4120920ce76 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -198,21 +198,32 @@ static void restore(void) static void gc_verify_track(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + // `gc_verify_track` is limited to single-threaded GC + if (jl_n_gcthreads != 0) + return; do { - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t mq; + jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + ws_queue_t *cq = &mq.chunk_queue; + ws_queue_t *q = &mq.ptr_queue; + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, jl_atomic_load_relaxed(&mq2->chunk_queue.array)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, jl_atomic_load_relaxed(&mq2->ptr_queue.array)); + arraylist_new(&mq.reclaim_set, 32); arraylist_push(&lostval_parents_done, lostval); jl_safe_printf("Now looking for %p =======\n", lostval); clear_mark(GC_CLEAN); - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_serial_(ptls, &mq); if (lostval_parents.len == 0) { jl_safe_printf("Could not find the missing link. We missed a toplevel root. This is odd.\n"); break; @@ -246,22 +257,35 @@ static void gc_verify_track(jl_ptls_t ptls) void gc_verify(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + // `gc_verify` is limited to single-threaded GC + if (jl_n_gcthreads != 0) { + jl_safe_printf("Warn. 
GC verify disabled in multi-threaded GC\n"); + return; + } + jl_gc_markqueue_t mq; + jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + ws_queue_t *cq = &mq.chunk_queue; + ws_queue_t *q = &mq.ptr_queue; + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, jl_atomic_load_relaxed(&mq2->chunk_queue.array)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, jl_atomic_load_relaxed(&mq2->ptr_queue.array)); + arraylist_new(&mq.reclaim_set, 32); lostval = NULL; lostval_parents.len = 0; lostval_parents_done.len = 0; clear_mark(GC_CLEAN); gc_verifying = 1; - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_serial_(ptls, &mq); int clean_len = bits_save[GC_CLEAN].len; for(int i = 0; i < clean_len + bits_save[GC_OLD].len; i++) { jl_taggedvalue_t *v = (jl_taggedvalue_t*)bits_save[i >= clean_len ? GC_OLD : GC_CLEAN].items[i >= clean_len ? i - clean_len : i]; @@ -500,7 +524,7 @@ int jl_gc_debug_check_other(void) return gc_debug_alloc_check(&jl_gc_debug_env.other); } -void jl_gc_debug_print_status(void) +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT { uint64_t pool_count = jl_gc_debug_env.pool.num; uint64_t other_count = jl_gc_debug_env.other.num; @@ -509,7 +533,7 @@ void jl_gc_debug_print_status(void) pool_count + other_count, pool_count, other_count, gc_num.pause); } -void jl_gc_debug_critical_error(void) +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT { jl_gc_debug_print_status(); if (!jl_gc_debug_env.wait_for_debugger) @@ -1264,139 +1288,6 @@ int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT return (slot - start) / elsize; } -// Print a backtrace from the bottom (start) of the mark stack up to `sp` -// `pc_offset` will be added to `sp` for convenience in the debugger. -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset) -{ - jl_jmp_buf *old_buf = jl_get_safe_restore(); - jl_jmp_buf buf; - jl_set_safe_restore(&buf); - if (jl_setjmp(buf, 0) != 0) { - jl_safe_printf("\n!!! ERROR when unwinding gc mark loop -- ABORTING !!!\n"); - jl_set_safe_restore(old_buf); - return; - } - void **top = sp.pc + pc_offset; - jl_gc_mark_data_t *data_top = sp.data; - sp.data = ptls->gc_cache.data_stack; - sp.pc = ptls->gc_cache.pc_stack; - int isroot = 1; - while (sp.pc < top) { - void *pc = *sp.pc; - const char *prefix = isroot ? 
"r--" : " `-"; - isroot = 0; - if (pc == gc_mark_label_addrs[GC_MARK_L_marked_obj]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Root object: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_scan_only]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Queued root: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_finlist]) { - gc_mark_finlist_t *data = gc_repush_markdata(&sp, gc_mark_finlist_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Finalizer list from %p to %p\n", - (void*)data, (void*)data->begin, (void*)data->end); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_objarray]) { - gc_mark_objarray_t *data = gc_repush_markdata(&sp, gc_mark_objarray_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Array in object %p :: %p -- [%p, %p)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (void*)data->begin, (void*)data->end); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj8]) { - gc_mark_obj8_t *data = gc_repush_markdata(&sp, gc_mark_obj8_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint8_t *desc = (uint8_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (8bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj16]) { - gc_mark_obj16_t *data = gc_repush_markdata(&sp, gc_mark_obj16_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint16_t *desc = (uint16_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (16bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj32]) { - gc_mark_obj32_t *data = gc_repush_markdata(&sp, gc_mark_obj32_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint32_t *desc = (uint32_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (32bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), 
(int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_stack]) { - gc_mark_stackframe_t *data = gc_repush_markdata(&sp, gc_mark_stackframe_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Stack frame %p -- %d of %d (%s)\n", - (void*)data, prefix, (void*)data->s, (int)data->i, - (int)data->nroots >> 1, - (data->nroots & 1) ? "indirect" : "direct"); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_module_binding]) { - // module_binding - gc_mark_binding_t *data = gc_repush_markdata(&sp, gc_mark_binding_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Module (bindings) %p (bits %d) -- [%p, %p)\n", - (void*)data, prefix, (void*)data->parent, (int)data->bits, - (void*)data->begin, (void*)data->end); - } - else { - jl_safe_printf("Unknown pc %p --- ABORTING !!!\n", pc); - break; - } - } - jl_set_safe_restore(old_buf); -} - static int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc.c b/src/gc.c index 341fae2b02553..027c551402b3f 100644 --- a/src/gc.c +++ b/src/gc.c @@ -11,6 +11,18 @@ extern "C" { #endif +// `tid` of mutator thread that triggered GC +_Atomic(int) gc_master_tid; +// `tid` of first GC thread +int gc_first_tid; + +// Mutex/cond used to synchronize sleep/wakeup of GC threads +uv_mutex_t gc_threads_lock; +uv_cond_t gc_threads_cond; + +// Number of threads currently running the GC mark-loop +_Atomic(int) gc_n_threads_marking; + // Linked list of callback functions typedef void (*jl_gc_cb_func_t)(void); @@ -112,17 +124,6 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); } -// Save/restore local mark stack to/from thread-local storage. - -STATIC_INLINE void export_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - ptls->gc_mark_sp = *sp; -} - -STATIC_INLINE void import_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - // Has the stack been reallocated in the meantime? - *sp = ptls->gc_mark_sp; -} - // Protect all access to `finalizer_list_marked` and `to_finalize`. // For accessing `ptls->finalizers`, the lock is needed if a thread // is going to realloc the buffer (of its own list) or accessing the @@ -327,16 +328,16 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) jl_wake_libuv(); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - // This acquire load pairs with the release stores - // in the signal handler of safepoint so we are sure that - // all the stores on those threads are visible. - // We're currently also using atomic store release in mutator threads - // (in jl_gc_state_set), but we may want to use signals to flush the - // memory operations on those threads lazily instead. - while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) - jl_cpu_pause(); // yield? + if (ptls2 != NULL) { + // This acquire load pairs with the release stores + // in the signal handler of safepoint so we are sure that + // all the stores on those threads are visible. 
+ // We're currently also using atomic store release in mutator threads + // (in jl_gc_state_set), but we may want to use signals to flush the + // memory operations on those threads lazily instead. + while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) + jl_cpu_pause(); // yield? + } } } @@ -644,7 +645,7 @@ void jl_gc_run_all_finalizers(jl_task_t *ct) schedule_all_finalizers(&finalizer_list_marked); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) schedule_all_finalizers(&ptls2->finalizers); } // unlock here because `run_finalizers` locks this @@ -720,7 +721,7 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); } finalize_object(&finalizer_list_marked, o, &copied_list, 0); @@ -760,7 +761,7 @@ static void gc_sweep_foreign_objs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); } } @@ -894,7 +895,7 @@ static void gc_sync_all_caches_nolock(jl_ptls_t ptls) assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2) + if (ptls2 != NULL) gc_sync_cache_nolock(ptls, &ptls2->gc_cache); } } @@ -913,21 +914,13 @@ STATIC_INLINE void gc_queue_big_marked(jl_ptls_t ptls, bigval_t *hdr, ptls->gc_cache.nbig_obj = nobj + 1; } -// `gc_setmark_tag` can be called concurrently on multiple threads. -// In all cases, the function atomically sets the mark bits and returns -// the GC bits set as well as if the tag was unchanged by this thread. -// All concurrent calls on the same object are guaranteed to be setting the -// bits to the same value. -// For normal objects, this is the bits with only `GC_MARKED` changed to `1` -// For buffers, this is the bits of the owner object. -// For `mark_reset_age`, this is `GC_MARKED` with `GC_OLD` cleared. -// The return value is `1` if the object was not marked before. -// Returning `0` can happen if another thread marked it in parallel. 
-STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, - uintptr_t tag, uint8_t *bits) JL_NOTSAFEPOINT -{ - assert(!gc_marked(tag)); +// Atomically set the mark bit for object and return whether it was previously unmarked +FORCE_INLINE int gc_try_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT +{ assert(gc_marked(mark_mode)); + uintptr_t tag = o->header; + if (gc_marked(tag)) + return 0; if (mark_reset_age) { // Reset the object as if it was just allocated mark_mode = GC_MARKED; @@ -939,7 +932,6 @@ STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, tag = tag | mark_mode; assert((tag & 0x3) == mark_mode); } - *bits = mark_mode; tag = jl_atomic_exchange_relaxed((_Atomic(uintptr_t)*)&o->header, tag); verify_val(jl_valueof(o)); return !gc_marked(tag); @@ -1022,15 +1014,12 @@ STATIC_INLINE void gc_setmark(jl_ptls_t ptls, jl_taggedvalue_t *o, STATIC_INLINE void gc_setmark_buf_(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL_NOTSAFEPOINT { jl_taggedvalue_t *buf = jl_astaggedvalue(o); - uintptr_t tag = buf->header; - if (gc_marked(tag)) - return; - uint8_t bits; + uint8_t bits = (gc_old(buf->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED; // If the object is larger than the max pool size it can't be a pool object. // This should be accurate most of the time but there might be corner cases // where the size estimate is a little off so we do a pool lookup to make // sure. - if (__likely(gc_setmark_tag(buf, mark_mode, tag, &bits)) && !gc_verifying) { + if (__likely(gc_try_setmark_tag(buf, mark_mode)) && !gc_verifying) { if (minsz <= GC_MAX_SZCLASS) { jl_gc_pagemeta_t *page = page_metadata(buf); if (page) { @@ -1078,7 +1067,7 @@ void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT jl_gc_queue_root(v); } -static inline void maybe_collect(jl_ptls_t ptls) +STATIC_INLINE void maybe_collect(jl_ptls_t ptls) { if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); @@ -1105,14 +1094,14 @@ static void clear_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n, l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - for (n = 0; n < l; n++) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) - wr->value = (jl_value_t*)jl_nothing; + if (ptls2 != NULL) { + size_t n, l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + for (n = 0; n < l; n++) { + jl_weakref_t *wr = (jl_weakref_t*)lst[n]; + if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) + wr->value = (jl_value_t*)jl_nothing; + } } } } @@ -1122,27 +1111,27 @@ static void sweep_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n = 0; - size_t ndel = 0; - size_t l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - if (l == 0) - continue; - while (1) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) - n++; - else - ndel++; - if (n >= l - ndel) - break; - void *tmp = lst[n]; - lst[n] = lst[n + ndel]; - lst[n + ndel] = tmp; + if (ptls2 != NULL) { + size_t n = 0; + size_t ndel = 0; + size_t l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + if (l == 0) + continue; + while (1) { + jl_weakref_t *wr = 
(jl_weakref_t*)lst[n]; + if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) + n++; + else + ndel++; + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] = tmp; + } + ptls2->heap.weak_refs.len -= ndel; } - ptls2->heap.weak_refs.len -= ndel; } } @@ -1150,7 +1139,7 @@ static void sweep_weak_refs(void) // big value list // Size includes the tag and the tag is not cleared!! -static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) +STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) { maybe_collect(ptls); size_t offs = offsetof(bigval_t, header); @@ -1182,7 +1171,6 @@ static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) { jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); - maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag); return val; } @@ -1250,9 +1238,8 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_big_list(sweep_full, &ptls2->heap.big_objects); + if (ptls2 != NULL) + sweep_big_list(sweep_full, &ptls2->heap.big_objects); } if (sweep_full) { bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); @@ -1321,7 +1308,7 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; - if (ptls) { + if (ptls != NULL) { memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); } @@ -1370,32 +1357,32 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - mallocarray_t *ma = ptls2->heap.mallocarrays; - mallocarray_t **pma = &ptls2->heap.mallocarrays; - while (ma != NULL) { - mallocarray_t *nxt = ma->next; - int bits = jl_astaggedvalue(ma->a)->bits.gc; - if (gc_marked(bits)) { - pma = &ma->next; - } - else { - *pma = nxt; - assert(ma->a->flags.how == 2); - jl_gc_free_array(ma->a); - ma->next = ptls2->heap.mafreelist; - ptls2->heap.mafreelist = ma; + if (ptls2 != NULL) { + mallocarray_t *ma = ptls2->heap.mallocarrays; + mallocarray_t **pma = &ptls2->heap.mallocarrays; + while (ma != NULL) { + mallocarray_t *nxt = ma->next; + int bits = jl_astaggedvalue(ma->a)->bits.gc; + if (gc_marked(bits)) { + pma = &ma->next; + } + else { + *pma = nxt; + assert(ma->a->flags.how == 2); + jl_gc_free_array(ma->a); + ma->next = ptls2->heap.mafreelist; + ptls2->heap.mafreelist = ma; + } + gc_time_count_mallocd_array(bits); + ma = nxt; } - gc_time_count_mallocd_array(bits); - ma = nxt; } } gc_time_mallocd_array_end(); } // pool allocation -static inline jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT +STATIC_INLINE jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT { assert(GC_PAGE_OFFSET >= sizeof(void*)); pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; @@ -1444,7 +1431,7 @@ static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT } // Size includes the tag and the tag is not cleared!! 
-static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, +STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize) { // Use the pool offset instead of the pool address as the argument @@ -1462,7 +1449,7 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset jl_atomic_load_relaxed(&ptls->gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; - if (v) { + if (v != NULL) { jl_taggedvalue_t *next = v->next; p->freelist = next; if (__unlikely(gc_page_data(v) != gc_page_data(next))) { @@ -1482,8 +1469,8 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset // If there's no pages left or the current page is used up, // we need to use the slow path. char *cur_page = gc_page_data((char*)v - 1); - if (__unlikely(!v || cur_page + GC_PAGE_SZ < (char*)next)) { - if (v) { + if (__unlikely(v == NULL || cur_page + GC_PAGE_SZ < (char*)next)) { + if (v != NULL) { // like the freelist case, // but only update the page metadata when it is full jl_gc_pagemeta_t *pg = jl_assume(page_metadata((char*)v - 1)); @@ -1493,7 +1480,7 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset v = *(jl_taggedvalue_t**)cur_page; } // Not an else!! - if (!v) + if (v == NULL) v = add_page(p); next = (jl_taggedvalue_t*)((char*)v + osize); } @@ -1507,7 +1494,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, int osize) { jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); - maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag); return val; } @@ -1660,7 +1646,7 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t } // the actual sweeping over all allocated pages in a memory pool -static inline void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT { int p_n = pg->pool_n; int t_n = pg->thread_n; @@ -1671,7 +1657,7 @@ static inline void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg } // sweep over a pagetable0 for all allocated pages -static inline int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *pagetable0, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *pagetable0, int sweep_full) JL_NOTSAFEPOINT { unsigned ub = 0; unsigned alloc = 0; @@ -1695,7 +1681,7 @@ static inline int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *p } // sweep over pagetable1 for all pagetable0 that may contain allocated pages -static inline int sweep_pool_pagetable1(jl_taggedvalue_t ***pfl, pagetable1_t *pagetable1, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE int sweep_pool_pagetable1(jl_taggedvalue_t ***pfl, pagetable1_t *pagetable1, int sweep_full) JL_NOTSAFEPOINT { unsigned ub = 0; unsigned alloc = 0; @@ -1724,7 +1710,7 @@ static void sweep_pool_pagetable(jl_taggedvalue_t ***pfl, int sweep_full) JL_NOT { if (REGION2_PG_COUNT == 1) { // compile-time optimization pagetable1_t *pagetable1 = memory_map.meta1[0]; - if (pagetable1) + if (pagetable1 != NULL) sweep_pool_pagetable1(pfl, pagetable1, sweep_full); return; } @@ -1823,10 +1809,10 @@ static void gc_sweep_pool(int sweep_full) // null out terminal pointers of free lists for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 
== NULL) - continue; - for (int i = 0; i < JL_GC_N_POOLS; i++) { - *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + if (ptls2 != NULL) { + for (int i = 0; i < JL_GC_N_POOLS; i++) { + *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } } } @@ -1910,7 +1896,7 @@ static void *volatile gc_findval; // for usage from gdb, for finding the gc-root // Handle the case where the stack is only partially copied. STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t addr = (uintptr_t)_addr; if (addr >= lb && addr < ub) @@ -1919,929 +1905,650 @@ STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, } STATIC_INLINE uintptr_t gc_read_stack(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t real_addr = gc_get_stack_addr(_addr, offset, lb, ub); return *(uintptr_t*)real_addr; } JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_ptls_t ptls, jl_datatype_t *vt, - jl_gc_mark_sp_t sp) + jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { jl_safe_printf("GC error (probable corruption) :\n"); jl_gc_debug_print_status(); jl_(vt); jl_gc_debug_critical_error(); - gc_mark_loop_unwind(ptls, sp, 0); abort(); } -// This stores the label address in the mark loop function. -// We can't directly store that to a global array so we need some hack to get that. -// See the call to `gc_mark_loop` in init with a `NULL` `ptls`. -void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - -// Double the local mark stack (both pc and data) -static void NOINLINE gc_mark_stack_resize(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) JL_NOTSAFEPOINT -{ - jl_gc_mark_data_t *old_data = gc_cache->data_stack; - void **pc_stack = sp->pc_start; - size_t stack_size = (char*)sp->pc_end - (char*)pc_stack; - ptrdiff_t datadiff = (char*)sp->data - (char*)old_data; - gc_cache->data_stack = (jl_gc_mark_data_t *)realloc_s(old_data, stack_size * 2 * sizeof(jl_gc_mark_data_t)); - sp->data = (jl_gc_mark_data_t *)((char*)gc_cache->data_stack + datadiff); - - sp->pc_start = gc_cache->pc_stack = (void**)realloc_s(pc_stack, stack_size * 2 * sizeof(void*)); - gc_cache->pc_stack_end = sp->pc_end = sp->pc_start + stack_size * 2; - sp->pc = sp->pc_start + (sp->pc - pc_stack); -} - -// Push a work item to the stack. The type of the work item is marked with `pc`. -// The data needed is in `data` and is of size `data_size`. -// If there isn't enough space on the stack, the stack will be resized with the stack -// lock held. The caller should invalidate any local cache of the stack addresses that's not -// in `gc_cache` or `sp` -// The `sp` will be updated on return if `inc` is true. -STATIC_INLINE void gc_mark_stack_push(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - void *pc, void *data, size_t data_size, int inc) JL_NOTSAFEPOINT +// Check if `nptr` is tagged for `old + refyoung`, +// Push the object to the remset and update the `nptr` counter if necessary. 
+STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, + uintptr_t nptr) JL_NOTSAFEPOINT { - assert(data_size <= sizeof(jl_gc_mark_data_t)); - if (__unlikely(sp->pc == sp->pc_end)) - gc_mark_stack_resize(gc_cache, sp); - *sp->pc = pc; - memcpy(sp->data, data, data_size); - if (inc) { - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + data_size); - sp->pc++; + if (__unlikely((nptr & 0x3) == 0x3)) { + ptls->heap.remset_nptr += nptr >> 2; + arraylist_t *remset = ptls->heap.remset; + size_t len = remset->len; + if (__unlikely(len >= remset->max)) { + arraylist_push(remset, obj); + } + else { + remset->len = len + 1; + remset->items[len] = obj; + } } } -// Check if the reference is non-NULL and atomically set the mark bit. -// Update `*nptr`, which is the `nptr` field of the parent item, if the object is young. -// Return the tag (with GC bits cleared) and the GC bits in `*ptag` and `*pbits`. -// Return whether the object needs to be scanned / have metadata updated. -STATIC_INLINE int gc_try_setmark(jl_value_t *obj, uintptr_t *nptr, - uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT +// Push a work item to the queue +STATIC_INLINE void gc_ptr_queue_push(jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT { - if (!obj) - return 0; - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - if (!gc_marked(tag)) { - uint8_t bits; - int res = gc_setmark_tag(o, GC_MARKED, tag, &bits); - if (!gc_old(bits)) - *nptr = *nptr | 1; - *ptag = tag & ~(uintptr_t)0xf; - *pbits = bits; - return __likely(res); - } - else if (!gc_old(tag)) { - *nptr = *nptr | 1; - } - return 0; + ws_array_t *old_a = ws_queue_push(&mq->ptr_queue, &obj, sizeof(jl_value_t*)); + // Put `old_a` in `reclaim_set` to be freed after the mark phase + if (__unlikely(old_a != NULL)) + arraylist_push(&mq->reclaim_set, old_a); } -// Queue a finalizer list to be scanned in the mark loop. Start marking from index `start`. -void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start) +// Pop from the mark queue +STATIC_INLINE jl_value_t *gc_ptr_queue_pop(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - size_t len = list->len; - if (len <= start) - return; - jl_value_t **items = (jl_value_t**)list->items; - gc_mark_finlist_t markdata = {items + start, items + len}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_finlist], - &markdata, sizeof(markdata), 1); + jl_value_t *v = NULL; + ws_queue_pop(&mq->ptr_queue, &v, sizeof(jl_value_t*)); + return v; } -// Queue a object to be scanned. The object should already be marked and the GC metadata -// should already be updated for it. Only scanning of the object should be performed. -STATIC_INLINE void gc_mark_queue_scan_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_value_t *obj) +// Steal from `mq2` +STATIC_INLINE jl_value_t *gc_ptr_queue_steal_from(jl_gc_markqueue_t *mq2) JL_NOTSAFEPOINT { - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - uint8_t bits = tag & 0xf; - tag = tag & ~(uintptr_t)0xf; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_scan_only], - &data, sizeof(data), 1); + jl_value_t *v = NULL; + ws_queue_steal_from(&mq2->ptr_queue, &v, sizeof(jl_value_t*)); + return v; } -// Mark and queue a object to be scanned. -// The object will be marked atomically which can also happen concurrently. 
-// It will be queued if the object wasn't marked already (or concurrently by another thread) -// Returns whether the object is young. -STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, void *_obj) JL_NOTSAFEPOINT +// Push chunk `*c` into chunk queue +STATIC_INLINE void gc_chunkqueue_push(jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT { - jl_value_t *obj = (jl_value_t*)jl_assume(_obj); - uintptr_t nptr = 0; - uintptr_t tag = 0; - uint8_t bits = 0; - if (!gc_try_setmark(obj, &nptr, &tag, &bits)) - return (int)nptr; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_marked_obj], - &data, sizeof(data), 1); - return (int)nptr; + ws_array_t *old_a = ws_queue_push(&mq->chunk_queue, c, sizeof(jl_gc_chunk_t)); + // Put `old_a` in `reclaim_set` to be freed after the mark phase + if (__unlikely(old_a != NULL)) + arraylist_push(&mq->reclaim_set, old_a); } -int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_value_t *obj) +// Pop chunk from chunk queue +STATIC_INLINE jl_gc_chunk_t gc_chunkqueue_pop(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(gc_cache, sp, obj); + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + ws_queue_pop(&mq->chunk_queue, &c, sizeof(jl_gc_chunk_t)); + return c; } -JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +// Steal chunk from `mq2` +STATIC_INLINE jl_gc_chunk_t gc_chunkqueue_steal_from(jl_gc_markqueue_t *mq2) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(&ptls->gc_cache, &ptls->gc_mark_sp, obj); + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + ws_queue_steal_from(&mq2->chunk_queue, &c, sizeof(jl_gc_chunk_t)); + return c; } -JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, - jl_value_t **objs, size_t nobjs) +// Enqueue an unmarked obj. last bit of `nptr` is set if `_obj` is young +STATIC_INLINE void gc_try_claim_and_push(jl_gc_markqueue_t *mq, void *_obj, + uintptr_t *nptr) JL_NOTSAFEPOINT { - gc_mark_objarray_t data = { parent, objs, objs + nobjs, 1, - jl_astaggedvalue(parent)->bits.gc & 2 }; - gc_mark_stack_push(&ptls->gc_cache, &ptls->gc_mark_sp, - gc_mark_label_addrs[GC_MARK_L_objarray], - &data, sizeof(data), 1); + if (_obj == NULL) + return; + jl_value_t *obj = (jl_value_t *)jl_assume(_obj); + jl_taggedvalue_t *o = jl_astaggedvalue(obj); + if (!gc_old(o->header) && nptr) + *nptr |= 1; + if (gc_try_setmark_tag(o, GC_MARKED)) + gc_ptr_queue_push(mq, obj); } - -// Check if `nptr` is tagged for `old + refyoung`, -// Push the object to the remset and update the `nptr` counter if necessary. 
-STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, uintptr_t nptr) JL_NOTSAFEPOINT +// Mark object with 8bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj8(jl_ptls_t ptls, char *obj8_parent, uint8_t *obj8_begin, + uint8_t *obj8_end, uintptr_t nptr) JL_NOTSAFEPOINT { - if (__unlikely((nptr & 0x3) == 0x3)) { - ptls->heap.remset_nptr += nptr >> 2; - arraylist_t *remset = ptls->heap.remset; - size_t len = remset->len; - if (__unlikely(len >= remset->max)) { - arraylist_push(remset, obj); - } - else { - remset->len = len + 1; - remset->items[len] = obj; + (void)jl_assume(obj8_begin < obj8_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj8_begin < obj8_end; obj8_begin++) { + slot = &((jl_value_t**)obj8_parent)[*obj8_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj8_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj8_parent, slot, (jl_datatype_t*)jl_typeof(obj8_parent))); + if (obj8_begin + 1 != obj8_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); + } + else { + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; + } + gc_heap_snapshot_record_object_edge((jl_value_t*)obj8_parent, slot); } } + gc_mark_push_remset(ptls, (jl_value_t *)obj8_parent, nptr); + return new_obj; } -// Scan a dense array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_objarray_t *objary, - jl_value_t **begin, jl_value_t **end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +// Mark object with 16bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj16(jl_ptls_t ptls, char *obj16_parent, uint16_t *obj16_begin, + uint16_t *obj16_end, uintptr_t nptr) JL_NOTSAFEPOINT { - (void)jl_assume(objary == (gc_mark_objarray_t*)sp->data); - for (; begin < end; begin += objary->step) { - *pnew_obj = *begin; - if (*pnew_obj) { - verify_parent2("obj array", objary->parent, begin, "elem(%d)", - gc_slot_to_arrayidx(objary->parent, begin)); - gc_heap_snapshot_record_array_edge(objary->parent, begin); - } - if (!gc_try_setmark(*pnew_obj, &objary->nptr, ptag, pbits)) - continue; - begin += objary->step; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - objary->begin = begin; - gc_repush_markdata(sp, gc_mark_objarray_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
- gc_mark_push_remset(ptls, objary->parent, objary->nptr); - } - return 1; - } - gc_mark_push_remset(ptls, objary->parent, objary->nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array8_t *ary8, - jl_value_t **begin, jl_value_t **end, - uint8_t *elem_begin, uint8_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary8 == (gc_mark_array8_t*)sp->data); - size_t elsize = ((jl_array_t*)ary8->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary8->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary8->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary8->elem.parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &ary8->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary8->elem.begin = elem_begin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); + (void)jl_assume(obj16_begin < obj16_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj16_begin < obj16_end; obj16_begin++) { + slot = &((jl_value_t **)obj16_parent)[*obj16_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj16_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj16_parent, slot, (jl_datatype_t*)jl_typeof(obj16_parent))); + gc_try_claim_and_push(mq, new_obj, &nptr); + if (obj16_begin + 1 != obj16_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary8->elem.begin = ary8->rebegin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
- gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - } + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; } - return 1; + gc_heap_snapshot_record_object_edge((jl_value_t*)obj16_parent, slot); } - elem_begin = ary8->rebegin; - } - gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array16_t *ary16, - jl_value_t **begin, jl_value_t **end, - uint16_t *elem_begin, uint16_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary16 == (gc_mark_array16_t*)sp->data); - size_t elsize = ((jl_array_t*)ary16->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary16->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary16->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary16->elem.parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &ary16->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary16->elem.begin = elem_begin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); + } + gc_mark_push_remset(ptls, (jl_value_t *)obj16_parent, nptr); + return new_obj; +} + +// Mark object with 32bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj32(jl_ptls_t ptls, char *obj32_parent, uint32_t *obj32_begin, + uint32_t *obj32_end, uintptr_t nptr) JL_NOTSAFEPOINT +{ + (void)jl_assume(obj32_begin < obj32_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj32_begin < obj32_end; obj32_begin++) { + slot = &((jl_value_t **)obj32_parent)[*obj32_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj32_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj32_parent, slot, (jl_datatype_t*)jl_typeof(obj32_parent))); + if (obj32_begin + 1 != obj32_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary16->elem.begin = ary16->rebegin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); - } + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; } - return 1; + gc_heap_snapshot_record_object_edge((jl_value_t*)obj32_parent, slot); } - elem_begin = ary16->rebegin; } - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); - return 0; -} - - -// Scan an object with 8bits field descriptors. 
see `gc_mark_obj8_t` -STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj8_t *obj8, - char *parent, uint8_t *begin, uint8_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(obj8 == (gc_mark_obj8_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); + return new_obj; +} + +// Mark object array +STATIC_INLINE void gc_mark_objarray(jl_ptls_t ptls, jl_value_t *obj_parent, jl_value_t **obj_begin, + jl_value_t **obj_end, uint32_t step, uintptr_t nptr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + // Decide whether need to chunk objary + (void)jl_assume(step > 0); + if ((nptr & 0x2) == 0x2) { + // pre-scan this object: most of this object should be old, so look for + // the first young object before starting this chunk + // (this also would be valid for young objects, but probably less beneficial) + for (; obj_begin < obj_end; obj_begin += step) { + new_obj = *obj_begin; + if (new_obj != NULL) { + verify_parent2("obj array", obj_parent, obj_begin, "elem(%d)", + gc_slot_to_arrayidx(obj_parent, obj_begin)); + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + if (!gc_old(o->header)) + nptr |= 1; + if (!gc_marked(o->header)) + break; + gc_heap_snapshot_record_array_edge(obj_parent, &new_obj); + } } - if (!gc_try_setmark(*pnew_obj, &obj8->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj8->begin = begin; - gc_repush_markdata(sp, gc_mark_obj8_t); + } + size_t too_big = (obj_end - obj_begin) / GC_CHUNK_BATCH_SIZE > step; // use this order of operations to avoid idiv + jl_value_t **scan_end = obj_end; + int pushed_chunk = 0; + if (too_big) { + scan_end = obj_begin + step * GC_CHUNK_BATCH_SIZE; + // case 1: array owner is young, so we won't need to scan through all its elements + // to know that we will never need to push it to the remset. it's fine + // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue + // ASAP in order to expose as much parallelism as possible + // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after + // scanning the array elements + if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) { + jl_gc_chunk_t c = {GC_objary_chunk, obj_parent, scan_end, obj_end, NULL, NULL, step, nptr}; + gc_chunkqueue_push(mq, &c); + pushed_chunk = 1; } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj8->parent, obj8->nptr); + } + for (; obj_begin < scan_end; obj_begin += step) { + new_obj = *obj_begin; + if (new_obj != NULL) { + verify_parent2("obj array", obj_parent, obj_begin, "elem(%d)", + gc_slot_to_arrayidx(obj_parent, obj_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(obj_parent, &new_obj); } - return 1; - } - gc_mark_push_remset(ptls, obj8->parent, obj8->nptr); - return 0; -} - -// Scan an object with 16bits field descriptors. 
see `gc_mark_obj16_t` -STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj16_t *obj16, - char *parent, uint16_t *begin, uint16_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT -{ - (void)jl_assume(obj16 == (gc_mark_obj16_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); + } + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_objary_chunk, obj_parent, scan_end, obj_end, NULL, NULL, step, nptr}; + gc_chunkqueue_push(mq, &c); } - if (!gc_try_setmark(*pnew_obj, &obj16->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj16->begin = begin; - gc_repush_markdata(sp, gc_mark_obj16_t); + } + else { + gc_mark_push_remset(ptls, obj_parent, nptr); + } +} + +// Mark array with 8bit field descriptors +STATIC_INLINE void gc_mark_array8(jl_ptls_t ptls, jl_value_t *ary8_parent, jl_value_t **ary8_begin, + jl_value_t **ary8_end, uint8_t *elem_begin, uint8_t *elem_end, + uintptr_t nptr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + size_t elsize = ((jl_array_t *)ary8_parent)->elsize / sizeof(jl_value_t *); + assert(elsize > 0); + // Decide whether need to chunk objary + if ((nptr & 0x2) == 0x2) { + // pre-scan this object: most of this object should be old, so look for + // the first young object before starting this chunk + // (this also would be valid for young objects, but probably less beneficial) + for (; ary8_begin < ary8_end; ary8_begin += elsize) { + int early_end = 0; + for (uint8_t *pindex = elem_begin; pindex < elem_end; pindex++) { + new_obj = ary8_begin[*pindex]; + if (new_obj != NULL) { + verify_parent2("array", ary8_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary8_parent, ary8_begin)); + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + if (!gc_old(o->header)) + nptr |= 1; + if (!gc_marked(o->header)){ + early_end = 1; + break; + } + gc_heap_snapshot_record_array_edge(ary8_parent, &new_obj); + } + } + if (early_end) + break; } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); + } + size_t too_big = (ary8_end - ary8_begin) / GC_CHUNK_BATCH_SIZE > elsize; // use this order of operations to avoid idiv + jl_value_t **scan_end = ary8_end; + int pushed_chunk = 0; + if (too_big) { + scan_end = ary8_begin + elsize * GC_CHUNK_BATCH_SIZE; + // case 1: array owner is young, so we won't need to scan through all its elements + // to know that we will never need to push it to the remset. 
it's fine + // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue + // ASAP in order to expose as much parallelism as possible + // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after + // scanning the array elements + if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) { + jl_gc_chunk_t c = {GC_ary8_chunk, ary8_parent, scan_end, ary8_end, elem_begin, elem_end, 0, nptr}; + gc_chunkqueue_push(mq, &c); + pushed_chunk = 1; } - return 1; - } - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); - return 0; -} - -// Scan an object with 32bits field descriptors. see `gc_mark_obj32_t` -STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj32_t *obj32, - char *parent, uint32_t *begin, uint32_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(obj32 == (gc_mark_obj32_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); + } + for (; ary8_begin < ary8_end; ary8_begin += elsize) { + for (uint8_t *pindex = elem_begin; pindex < elem_end; pindex++) { + new_obj = ary8_begin[*pindex]; + if (new_obj != NULL) { + verify_parent2("array", ary8_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary8_parent, ary8_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(ary8_parent, &new_obj); + } } - if (!gc_try_setmark(*pnew_obj, &obj32->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj32->begin = begin; - gc_repush_markdata(sp, gc_mark_obj32_t); + } + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_ary8_chunk, ary8_parent, scan_end, ary8_end, elem_begin, elem_end, 0, nptr}; + gc_chunkqueue_push(mq, &c); } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj32->parent, obj32->nptr); + } + else { + gc_mark_push_remset(ptls, ary8_parent, nptr); + } +} + +// Mark array with 16bit field descriptors +STATIC_INLINE void gc_mark_array16(jl_ptls_t ptls, jl_value_t *ary16_parent, jl_value_t **ary16_begin, + jl_value_t **ary16_end, uint16_t *elem_begin, uint16_t *elem_end, + uintptr_t nptr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + size_t elsize = ((jl_array_t *)ary16_parent)->elsize / sizeof(jl_value_t *); + // Decide whether need to chunk ary16 + size_t too_big = (ary16_end - ary16_begin) / GC_CHUNK_BATCH_SIZE > elsize; // use this order of operations to avoid idiv + jl_value_t **scan_end = ary16_end; + int pushed_chunk = 0; + if (too_big) { + scan_end = ary16_begin + elsize * GC_CHUNK_BATCH_SIZE; + // case 1: array owner is young, so we won't need to scan through all its elements + // to know that we will never need to push it to the remset. 
it's fine + // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue + // ASAP in order to expose as much parallelism as possible + // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after + // scanning the array elements + if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) { + jl_gc_chunk_t c = {GC_ary16_chunk, ary16_parent, scan_end, ary16_end, elem_begin, elem_end, elsize, nptr}; + gc_chunkqueue_push(mq, &c); + pushed_chunk = 1; } - return 1; } - gc_mark_push_remset(ptls, obj32->parent, obj32->nptr); - return 0; -} - -#if defined(__GNUC__) && !defined(_OS_EMSCRIPTEN_) -# define gc_mark_laddr(name) (&&name) -# define gc_mark_jmp(ptr) goto *(ptr) -#else -#define gc_mark_laddr(name) ((void*)(uintptr_t)GC_MARK_L_##name) -#define gc_mark_jmp(ptr) do { \ - switch ((int)(uintptr_t)ptr) { \ - case GC_MARK_L_marked_obj: \ - goto marked_obj; \ - case GC_MARK_L_scan_only: \ - goto scan_only; \ - case GC_MARK_L_finlist: \ - goto finlist; \ - case GC_MARK_L_objarray: \ - goto objarray; \ - case GC_MARK_L_array8: \ - goto array8; \ - case GC_MARK_L_array16: \ - goto array16; \ - case GC_MARK_L_obj8: \ - goto obj8; \ - case GC_MARK_L_obj16: \ - goto obj16; \ - case GC_MARK_L_obj32: \ - goto obj32; \ - case GC_MARK_L_stack: \ - goto stack; \ - case GC_MARK_L_excstack: \ - goto excstack; \ - case GC_MARK_L_module_binding: \ - goto module_binding; \ - default: \ - abort(); \ - } \ - } while (0) -#endif - -// This is the main marking loop. -// It uses an iterative (mostly) Depth-first search (DFS) to mark all the objects. -// Instead of using the native stack, two stacks are manually maintained, -// one (fixed-size) pc stack which stores the return address and one (variable-size) -// data stack which stores the local variables needed by the scanning code. -// Using a manually maintained stack has a few advantages -// -// 1. We can resize the stack as we go and never worry about stack overflow -// This is especitally useful when enters the GC in a deep call stack. -// It also removes the very deep GC call stack in a profile. -// 2. We can minimize the number of local variables to save on the stack. -// This includes minimizing the sizes of the stack frames and only saving variables -// that have been changed before making "function calls" (i.e. `goto mark;`) -// 3. We can perform end-of-loop tail-call optimization for common cases. -// 4. The marking can be interrupted more easily since all the states are maintained -// in a well-defined format already. -// This will be useful if we want to have incremental marking again. -// 5. The frames can be stolen by another thread more easily and it is not necessary -// to copy works to be stolen to another queue. Useful for parallel marking. -// (Will still require synchronization in stack popping of course.) -// 6. A flat function (i.e. no or very few function calls) also give the compiler -// opportunity to keep more states in registers that doesn't have to be spilled as often. -// -// We use two stacks so that the thief on another thread can steal the fixed sized pc stack -// and use that to figure out the size of the struct on the variable size data stack. -// -// The main disadvantages are that we bypass some stack-based CPU optimizations including the -// stack engine and return address prediction. -// Using two stacks also double the number of operations on the stack pointer -// though we still only need to use one of them (the pc stack pointer) for bounds check. 
-// In general, it seems that the reduction of stack memory ops and instructions count -// have a larger positive effect on the performance. =) - -// As a general guide we do not want to make non-inlined function calls in this function -// if possible since a large number of registers has to be spilled when that happens. -// This is especially true on on X86 which doesn't have many (any?) -// callee saved general purpose registers. -// (OTOH, the spill will likely make use of the stack engine which is otherwise idle so -// the performance impact is minimum as long as it's not in the hottest path) - -// There are three external entry points to the loop, corresponding to label -// `marked_obj`, `scan_only` and `finlist` (see the corresponding functions -// `gc_mark_queue_obj`, `gc_mark_queue_scan_obj` and `gc_mark_queue_finlist` above). -// The scanning of the object starts with `goto mark`, which updates the metadata and scans -// the object whose information is stored in `new_obj`, `tag` and `bits`. -// The branches in `mark` will dispatch the object to one of the scan "loop"s to be scanned -// as either a normal julia object or one of the special objects with specific storage format. -// Each of the scan "loop" will perform a DFS of the object in the following way -// -// 1. When encountering an pointer (julia object reference) slots, load, perform NULL check -// and atomically set the mark bits to determine if the object needs to be scanned. -// 2. If yes, it'll push itself back onto the mark stack (after updating fields that are changed) -// using `gc_repush_markdata` to increment the stack pointers. -// This step can also be replaced by a tail call by finishing up the marking of the current -// object when the end of the current object is reached. -// 3. Jump to `mark`. The marking of the current object will be resumed after the child is -// scanned by popping the stack frame back. -// -// Some of the special object scannings use BFS to simplify the code (Task and Module). - -// The jumps from the dispatch to the scan "loop"s are done by first pushing a frame -// to the stacks while only increment the data stack pointer before jumping to the loop -// This way the scan "loop" gets exactly what it expects after a stack pop. -// Additional optimizations are done for some of the common cases by skipping -// the unnecessary data stack pointer increment and the load from the stack -// (i.e. store to load forwarding). See `objary_loaded`, `obj8_loaded` and `obj16_loaded`. 
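The deleted block above is the design rationale for the old two-stack mark loop; the code this patch adds replaces it with per-thread mark queues plus chunking of oversized arrays, so that idle GC threads can steal self-contained pieces of work. Below is a minimal sketch of that chunking idea, assuming a toy fixed-capacity LIFO in place of the real lock-free work-stealing queue; `toy_chunk_t`, `toy_push_chunk`, `toy_mark_objarray` and `BATCH` are illustrative stand-ins for `jl_gc_chunk_t`, `gc_chunkqueue_push`, `gc_mark_objarray` and `GC_CHUNK_BATCH_SIZE`, not code from this patch:

```c
#include <stddef.h>
#include <stdio.h>

#define BATCH 4096 /* stand-in for GC_CHUNK_BATCH_SIZE */

typedef struct {
    void **begin; /* first deferred slot */
    void **end;   /* one past the last slot */
} toy_chunk_t;

static toy_chunk_t toy_chunks[64];
static int toy_nchunks = 0;

static void toy_push_chunk(void **begin, void **end)
{
    toy_chunks[toy_nchunks++] = (toy_chunk_t){begin, end};
}

/* Scan at most BATCH slots now and defer the remainder as a chunk
 * that any idle worker could pick up later. */
static void toy_mark_objarray(void **begin, void **end)
{
    void **scan_end = end;
    if ((size_t)(end - begin) > BATCH) {
        scan_end = begin + BATCH;
        toy_push_chunk(scan_end, end); /* remainder becomes stealable work */
    }
    for (; begin < scan_end; begin++) {
        if (*begin != NULL) {
            /* a real marker would try to claim *begin and enqueue it */
        }
    }
}

int main(void)
{
    static void *slots[10000]; /* zero-initialized: all slots are NULL */
    toy_mark_objarray(slots, slots + 10000);
    printf("deferred %d chunk(s)\n", toy_nchunks); /* prints: deferred 1 chunk(s) */
    return 0;
}
```

Capping each queue item at `BATCH` slots is what lets a single huge array be spread across several marker threads instead of pinning one worker, which is the parallelism the `gc_mark_objarray`/`gc_mark_array8`/`gc_mark_array16` additions in this patch aim for.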
-JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp) -{ - if (__unlikely(ptls == NULL)) { - gc_mark_label_addrs[GC_MARK_L_marked_obj] = gc_mark_laddr(marked_obj); - gc_mark_label_addrs[GC_MARK_L_scan_only] = gc_mark_laddr(scan_only); - gc_mark_label_addrs[GC_MARK_L_finlist] = gc_mark_laddr(finlist); - gc_mark_label_addrs[GC_MARK_L_objarray] = gc_mark_laddr(objarray); - gc_mark_label_addrs[GC_MARK_L_array8] = gc_mark_laddr(array8); - gc_mark_label_addrs[GC_MARK_L_array16] = gc_mark_laddr(array16); - gc_mark_label_addrs[GC_MARK_L_obj8] = gc_mark_laddr(obj8); - gc_mark_label_addrs[GC_MARK_L_obj16] = gc_mark_laddr(obj16); - gc_mark_label_addrs[GC_MARK_L_obj32] = gc_mark_laddr(obj32); - gc_mark_label_addrs[GC_MARK_L_stack] = gc_mark_laddr(stack); - gc_mark_label_addrs[GC_MARK_L_excstack] = gc_mark_laddr(excstack); - gc_mark_label_addrs[GC_MARK_L_module_binding] = gc_mark_laddr(module_binding); - return; + for (; ary16_begin < scan_end; ary16_begin += elsize) { + for (uint16_t *pindex = elem_begin; pindex < elem_end; pindex++) { + new_obj = ary16_begin[*pindex]; + if (new_obj != NULL) { + verify_parent2("array", ary16_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary16_parent, ary16_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(ary16_parent, &new_obj); + } + } } - - jl_value_t *new_obj = NULL; - uintptr_t tag = 0; - uint8_t bits = 0; - int meta_updated = 0; - - gc_mark_objarray_t *objary; - jl_value_t **objary_begin; - jl_value_t **objary_end; - - gc_mark_array8_t *ary8; - gc_mark_array16_t *ary16; - - gc_mark_obj8_t *obj8; - char *obj8_parent; - uint8_t *obj8_begin; - uint8_t *obj8_end; - - gc_mark_obj16_t *obj16; - char *obj16_parent; - uint16_t *obj16_begin; - uint16_t *obj16_end; - -pop: - if (sp.pc == sp.pc_start) { - // TODO: stealing form another thread - return; + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_ary16_chunk, ary16_parent, scan_end, ary16_end, elem_begin, elem_end, elsize, nptr}; + gc_chunkqueue_push(mq, &c); + } } - sp.pc--; - gc_mark_jmp(*sp.pc); // computed goto - -marked_obj: { - // An object that has been marked and needs have metadata updated and scanned. - gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); - new_obj = obj->obj; - tag = obj->tag; - bits = obj->bits; - goto mark; - } - -scan_only: { - // An object that has been marked and needs to be scanned. 
- gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); - new_obj = obj->obj; - tag = obj->tag; - bits = obj->bits; - meta_updated = 1; - goto mark; - } - -objarray: - objary = gc_pop_markdata(&sp, gc_mark_objarray_t); - objary_begin = objary->begin; - objary_end = objary->end; -objarray_loaded: - if (gc_mark_scan_objarray(ptls, &sp, objary, objary_begin, objary_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array8: - ary8 = gc_pop_markdata(&sp, gc_mark_array8_t); - objary_begin = ary8->begin; - objary_end = ary8->end; - obj8_begin = ary8->elem.begin; - obj8_end = ary8->elem.end; -array8_loaded: - if (gc_mark_scan_array8(ptls, &sp, ary8, objary_begin, objary_end, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array16: - ary16 = gc_pop_markdata(&sp, gc_mark_array16_t); - objary_begin = ary16->begin; - objary_end = ary16->end; - obj16_begin = ary16->elem.begin; - obj16_end = ary16->elem.end; -array16_loaded: - if (gc_mark_scan_array16(ptls, &sp, ary16, objary_begin, objary_end, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj8: - obj8 = gc_pop_markdata(&sp, gc_mark_obj8_t); - obj8_parent = (char*)obj8->parent; - obj8_begin = obj8->begin; - obj8_end = obj8->end; -obj8_loaded: - if (gc_mark_scan_obj8(ptls, &sp, obj8, obj8_parent, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj16: - obj16 = gc_pop_markdata(&sp, gc_mark_obj16_t); - obj16_parent = (char*)obj16->parent; - obj16_begin = obj16->begin; - obj16_end = obj16->end; -obj16_loaded: - if (gc_mark_scan_obj16(ptls, &sp, obj16, obj16_parent, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj32: { - gc_mark_obj32_t *obj32 = gc_pop_markdata(&sp, gc_mark_obj32_t); - char *parent = (char*)obj32->parent; - uint32_t *begin = obj32->begin; - uint32_t *end = obj32->end; - if (gc_mark_scan_obj32(ptls, &sp, obj32, parent, begin, end, &new_obj, &tag, &bits)) - goto mark; - goto pop; + else { + gc_mark_push_remset(ptls, ary16_parent, nptr); } +} -stack: { - // Scan the stack. see `gc_mark_stackframe_t` - // The task object this stack belongs to is being scanned separately as a normal - // 8bit field descriptor object. - gc_mark_stackframe_t *stack = gc_pop_markdata(&sp, gc_mark_stackframe_t); - jl_gcframe_t *s = stack->s; - uint32_t i = stack->i; - uint32_t nroots = stack->nroots; - uintptr_t offset = stack->offset; - uintptr_t lb = stack->lb; - uintptr_t ub = stack->ub; - uint32_t nr = nroots >> 2; - uintptr_t nptr = 0; - while (1) { - jl_value_t ***rts = (jl_value_t***)(((void**)s) + 2); - for (; i < nr; i++) { - if (nroots & 1) { - void **slot = (void**)gc_read_stack(&rts[i], offset, lb, ub); - new_obj = (jl_value_t*)gc_read_stack(slot, offset, lb, ub); - } - else { - new_obj = (jl_value_t*)gc_read_stack(&rts[i], offset, lb, ub); - if (gc_ptr_tag(new_obj, 1)) { - // handle tagged pointers in finalizer list - new_obj = gc_ptr_clear_tag(new_obj, 1); - // skip over the finalizer fptr - i++; - } - if (gc_ptr_tag(new_obj, 2)) - continue; - } - if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) - continue; - gc_heap_snapshot_record_frame_to_object_edge(s, new_obj); - i++; - if (i < nr) { - // Haven't done with this one yet. 
Update the content and push it back - stack->i = i; - gc_repush_markdata(&sp, gc_mark_stackframe_t); - } - // TODO stack addresses needs copy stack handling - else if ((s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub))) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - stack->i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - stack->nroots = (uint32_t)new_nroots; - gc_repush_markdata(&sp, gc_mark_stackframe_t); - } - goto mark; - } - s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub); - // walk up one stack frame - if (s != 0) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - nroots = stack->nroots = (uint32_t)new_nroots; - nr = nroots >> 2; - continue; - } - goto pop; +// Mark chunk of large array +STATIC_INLINE void gc_mark_chunk(jl_ptls_t ptls, jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT +{ + switch (c->cid) { + case GC_objary_chunk: { + jl_value_t *obj_parent = c->parent; + jl_value_t **obj_begin = c->begin; + jl_value_t **obj_end = c->end; + uint32_t step = c->step; + uintptr_t nptr = c->nptr; + gc_mark_objarray(ptls, obj_parent, obj_begin, obj_end, step, + nptr); + break; } - } - -excstack: { - // Scan an exception stack - gc_mark_excstack_t *stackitr = gc_pop_markdata(&sp, gc_mark_excstack_t); - jl_excstack_t *excstack = stackitr->s; - size_t itr = stackitr->itr; - size_t bt_index = stackitr->bt_index; - size_t jlval_index = stackitr->jlval_index; - while (itr > 0) { - size_t bt_size = jl_excstack_bt_size(excstack, itr); - jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); - for (; bt_index < bt_size; bt_index += jl_bt_entry_size(bt_data + bt_index)) { - jl_bt_element_t *bt_entry = bt_data + bt_index; - if (jl_bt_is_native(bt_entry)) - continue; - // Found an extended backtrace entry: iterate over any - // GC-managed values inside. 
- size_t njlvals = jl_bt_num_jlvals(bt_entry); - while (jlval_index < njlvals) { - new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index); - gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj); - uintptr_t nptr = 0; - jlval_index += 1; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } - } - jlval_index = 0; - } - // The exception comes last - mark it - new_obj = jl_excstack_exception(excstack, itr); - gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj); - itr = jl_excstack_next(excstack, itr); - bt_index = 0; - jlval_index = 0; - uintptr_t nptr = 0; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } + case GC_ary8_chunk: { + jl_value_t *ary8_parent = c->parent; + jl_value_t **ary8_begin = c->begin; + jl_value_t **ary8_end = c->end; + uint8_t *elem_begin = (uint8_t *)c->elem_begin; + uint8_t *elem_end = (uint8_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array8(ptls, ary8_parent, ary8_begin, ary8_end, elem_begin, elem_end, + nptr); + break; + } + case GC_ary16_chunk: { + jl_value_t *ary16_parent = c->parent; + jl_value_t **ary16_begin = c->begin; + jl_value_t **ary16_end = c->end; + uint16_t *elem_begin = (uint16_t *)c->elem_begin; + uint16_t *elem_end = (uint16_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array16(ptls, ary16_parent, ary16_begin, ary16_end, elem_begin, elem_end, + nptr); + break; + } + case GC_finlist_chunk: { + jl_value_t **fl_begin = c->begin; + jl_value_t **fl_end = c->end; + gc_mark_finlist_(mq, fl_begin, fl_end); + break; + } + default: { + // `empty-chunk` should be checked by caller + jl_safe_printf("GC internal error: chunk mismatch\n"); + abort(); } - goto pop; } +} -module_binding: { - // Scan a module. 
see `gc_mark_binding_t` - // Other fields of the module will be scanned after the bindings are scanned - gc_mark_binding_t *binding = gc_pop_markdata(&sp, gc_mark_binding_t); - jl_binding_t **begin = binding->begin; - jl_binding_t **end = binding->end; - uint8_t mbits = binding->bits; - for (; begin < end; begin += 2) { - jl_binding_t *b = *begin; - if (b == (jl_binding_t*)HT_NOTFOUND) - continue; - if (jl_object_in_image((jl_value_t*)b)) { - jl_taggedvalue_t *buf = jl_astaggedvalue(b); - uintptr_t tag = buf->header; - uint8_t bits; - if (!gc_marked(tag)) - gc_setmark_tag(buf, GC_OLD_MARKED, tag, &bits); +// Mark gc frame +STATIC_INLINE void gc_mark_stack(jl_ptls_t ptls, jl_gcframe_t *s, uint32_t nroots, uintptr_t offset, + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + uint32_t nr = nroots >> 2; + while (1) { + jl_value_t ***rts = (jl_value_t ***)(((void **)s) + 2); + for (uint32_t i = 0; i < nr; i++) { + if (nroots & 1) { + void **slot = (void **)gc_read_stack(&rts[i], offset, lb, ub); + new_obj = (jl_value_t *)gc_read_stack(slot, offset, lb, ub); } else { - gc_setmark_buf_(ptls, b, mbits, sizeof(jl_binding_t)); - } - void *vb = jl_astaggedvalue(b); - verify_parent1("module", binding->parent, &vb, "binding_buff"); - // Record the size used for the box for non-const bindings - gc_heap_snapshot_record_module_to_binding(binding->parent, b); - (void)vb; - jl_value_t *ty = jl_atomic_load_relaxed(&b->ty); - if (ty && ty != (jl_value_t*)jl_any_type) { - verify_parent2("module", binding->parent, - &b->ty, "binding(%s)", jl_symbol_name(b->name)); - if (gc_try_setmark(ty, &binding->nptr, &tag, &bits)) { - new_obj = ty; - gc_repush_markdata(&sp, gc_mark_binding_t); - goto mark; - } - } - jl_value_t *value = jl_atomic_load_relaxed(&b->value); - jl_value_t *globalref = jl_atomic_load_relaxed(&b->globalref); - if (value) { - verify_parent2("module", binding->parent, - &b->value, "binding(%s)", jl_symbol_name(b->name)); - if (gc_try_setmark(value, &binding->nptr, &tag, &bits)) { - new_obj = value; - begin += 2; - binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); - uintptr_t gr_tag; - uint8_t gr_bits; - if (gc_try_setmark(globalref, &binding->nptr, &gr_tag, &gr_bits)) { - gc_mark_marked_obj_t data = {globalref, gr_tag, gr_bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(marked_obj), - &data, sizeof(data), 1); - } - goto mark; + new_obj = (jl_value_t *)gc_read_stack(&rts[i], offset, lb, ub); + if (gc_ptr_tag(new_obj, 1)) { + // handle tagged pointers in finalizer list + new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1); + // skip over the finalizer fptr + i++; } + if (gc_ptr_tag(new_obj, 2)) + continue; } - if (gc_try_setmark(globalref, &binding->nptr, &tag, &bits)) { - begin += 2; - binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); - new_obj = globalref; - goto mark; + if (new_obj != NULL) { + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(s, new_obj); } } - jl_module_t *m = binding->parent; - int scanparent = gc_try_setmark((jl_value_t*)m->parent, &binding->nptr, &tag, &bits); - size_t nusings = m->usings.len; - if (nusings) { - // this is only necessary because bindings for "using" modules - // are added only when accessed. therefore if a module is replaced - // after "using" it but before accessing it, this array might - // contain the only reference. 
- objary_begin = (jl_value_t**)m->usings.items; - objary_end = objary_begin + nusings; - gc_mark_objarray_t data = {(jl_value_t*)m, objary_begin, objary_end, 1, binding->nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &data, sizeof(data), 0); - if (!scanparent) { - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + jl_gcframe_t *sprev = (jl_gcframe_t *)gc_read_stack(&s->prev, offset, lb, ub); + if (sprev == NULL) + break; + gc_heap_snapshot_record_frame_to_frame_edge(s, sprev); + s = sprev; + uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); + assert(new_nroots <= UINT32_MAX); + nroots = (uint32_t)new_nroots; + nr = nroots >> 2; + } +} + +// Mark exception stack +STATIC_INLINE void gc_mark_excstack(jl_ptls_t ptls, jl_excstack_t *excstack, size_t itr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + while (itr > 0) { + size_t bt_size = jl_excstack_bt_size(excstack, itr); + jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); + for (size_t bt_index = 0; bt_index < bt_size; + bt_index += jl_bt_entry_size(bt_data + bt_index)) { + jl_bt_element_t *bt_entry = bt_data + bt_index; + if (jl_bt_is_native(bt_entry)) + continue; + // Found an extended backtrace entry: iterate over any + // GC-managed values inside. + size_t njlvals = jl_bt_num_jlvals(bt_entry); + for (size_t jlval_index = 0; jlval_index < njlvals; jlval_index++) { + new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index); + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj); } - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(data)); - sp.pc++; + } + // The exception comes last - mark it + new_obj = jl_excstack_exception(excstack, itr); + itr = jl_excstack_next(excstack, itr); + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj); + } +} + +// Mark module binding +STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent, jl_binding_t **mb_begin, + jl_binding_t **mb_end, uintptr_t nptr, + uint8_t bits) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + for (; mb_begin < mb_end; mb_begin += 2) { + jl_binding_t *b = *mb_begin; + if (b == (jl_binding_t *)HT_NOTFOUND) + continue; + if (jl_object_in_image((jl_value_t*)b)) { + jl_taggedvalue_t *buf = jl_astaggedvalue(b); + gc_try_setmark_tag(buf, GC_OLD_MARKED); } else { - gc_mark_push_remset(ptls, (jl_value_t*)m, binding->nptr); + gc_setmark_buf_(ptls, b, bits, sizeof(jl_binding_t)); } - if (scanparent) { - new_obj = (jl_value_t*)m->parent; - goto mark; + void *vb = jl_astaggedvalue(b); + verify_parent1("module", binding->parent, &vb, "binding_buff"); + // Record the size used for the box for non-const bindings + gc_heap_snapshot_record_module_to_binding(mb_parent, b); + (void)vb; + jl_value_t *ty = jl_atomic_load_relaxed(&b->ty); + if (ty && ty != (jl_value_t*)jl_any_type) { + verify_parent2("module", binding->parent, + &b->ty, "binding(%s)", jl_symbol_name(b->name)); + gc_try_claim_and_push(mq, ty, &nptr); } - goto pop; + jl_value_t *value = jl_atomic_load_relaxed(&b->value); + if (value) { + verify_parent2("module", binding->parent, + &b->value, "binding(%s)", jl_symbol_name(b->name)); + gc_try_claim_and_push(mq, value, &nptr); + } + jl_value_t *globalref = jl_atomic_load_relaxed(&b->globalref); + gc_try_claim_and_push(mq, globalref, &nptr); + } + gc_try_claim_and_push(mq, (jl_value_t *)mb_parent->parent, &nptr); + size_t 
nusings = mb_parent->usings.len; + if (nusings > 0) { + // this is only necessary because bindings for "using" modules + // are added only when accessed. therefore if a module is replaced + // after "using" it but before accessing it, this array might + // contain the only reference. + jl_value_t *obj_parent = (jl_value_t *)mb_parent; + jl_value_t **objary_begin = (jl_value_t **)mb_parent->usings.items; + jl_value_t **objary_end = objary_begin + nusings; + gc_mark_objarray(ptls, obj_parent, objary_begin, objary_end, 1, nptr); + } + else { + gc_mark_push_remset(ptls, (jl_value_t *)mb_parent, nptr); } +} -finlist: { - // Scan a finalizer (or format compatible) list. see `gc_mark_finlist_t` - gc_mark_finlist_t *finlist = gc_pop_markdata(&sp, gc_mark_finlist_t); - jl_value_t **begin = finlist->begin; - jl_value_t **end = finlist->end; - for (; begin < end; begin++) { - new_obj = *begin; - if (__unlikely(!new_obj)) - continue; - if (gc_ptr_tag(new_obj, 1)) { - new_obj = (jl_value_t*)gc_ptr_clear_tag(new_obj, 1); - begin++; - assert(begin < end); - } - if (gc_ptr_tag(new_obj, 2)) - continue; - uintptr_t nptr = 0; - if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - finlist->begin = begin; - gc_repush_markdata(&sp, gc_mark_finlist_t); - } - goto mark; +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) +{ + jl_value_t *new_obj; + // Decide whether need to chunk finlist + size_t nrefs = (fl_end - fl_begin); + if (nrefs > GC_CHUNK_BATCH_SIZE) { + jl_gc_chunk_t c = {GC_finlist_chunk, NULL, fl_begin + GC_CHUNK_BATCH_SIZE, fl_end, 0, 0, 0, 0}; + gc_chunkqueue_push(mq, &c); + fl_end = fl_begin + GC_CHUNK_BATCH_SIZE; + } + for (; fl_begin < fl_end; fl_begin++) { + new_obj = *fl_begin; + if (__unlikely(!new_obj)) + continue; + if (gc_ptr_tag(new_obj, 1)) { + new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1); + fl_begin++; + assert(fl_begin < fl_end); } - goto pop; + if (gc_ptr_tag(new_obj, 2)) + continue; + gc_try_claim_and_push(mq, new_obj, NULL); } +} -mark: { - // Generic scanning entry point. - // Expects `new_obj`, `tag` and `bits` to be set correctly. -#ifdef JL_DEBUG_BUILD +// Mark finalizer list (or list of objects following same format) +void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) +{ + size_t len = list->len; + if (len <= start) + return; + jl_value_t **fl_begin = (jl_value_t **)list->items + start; + jl_value_t **fl_end = (jl_value_t **)list->items + len; + gc_mark_finlist_(mq, fl_begin, fl_end); +} + +JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +{ + int may_claim = gc_try_setmark_tag(jl_astaggedvalue(obj), GC_MARKED); + if (may_claim) + gc_ptr_queue_push(&ptls->mark_queue, obj); + return may_claim; +} + +JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, + jl_value_t **objs, size_t nobjs) +{ + uintptr_t nptr = (nobjs << 2) | (jl_astaggedvalue(parent)->bits.gc & 2); + gc_mark_objarray(ptls, parent, objs, objs + nobjs, 1, nptr); +} + +// Enqueue and mark all outgoing references from `new_obj` which have not been marked +// yet. 
`meta_updated` is mostly used to make sure we don't update metadata twice for +// objects which have been enqueued into the `remset` +FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_new_obj, + int meta_updated) +{ + jl_value_t *new_obj = (jl_value_t *)_new_obj; + mark_obj: { + #ifdef JL_DEBUG_BUILD if (new_obj == gc_findval) jl_raise_debugger(); -#endif + #endif jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); - jl_datatype_t *vt = (jl_datatype_t*)tag; - int foreign_alloc = 0; + jl_datatype_t *vt = (jl_datatype_t *)(o->header & ~(uintptr_t)0xf); + uint8_t bits = (gc_old(o->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED; int update_meta = __likely(!meta_updated && !gc_verifying); - if (update_meta && o->bits.in_image) { + int foreign_alloc = 0; + if (update_meta && jl_object_in_image(new_obj)) { foreign_alloc = 1; update_meta = 0; } - meta_updated = 0; // Symbols are always marked assert(vt != jl_symbol_type); if (vt == jl_simplevector_type) { size_t l = jl_svec_len(new_obj); jl_value_t **data = jl_svec_data(new_obj); - size_t dtsz = l * sizeof(void*) + sizeof(jl_svec_t); + size_t dtsz = l * sizeof(void *) + sizeof(jl_svec_t); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = data; + jl_value_t **objary_end = data + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = data; - objary_end = data + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (vt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)new_obj; + jl_array_t *a = (jl_array_t *)new_obj; jl_array_flags_t flags = a->flags; if (update_meta) { if (flags.pooled) @@ -2849,9 +2556,10 @@ mark: { else gc_setmark_big(ptls, o, bits); } - else if (foreign_alloc) + else if (foreign_alloc) { objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_array_t)); - if (flags.how ==0){ + } + if (flags.how == 0) { void *data_ptr = (char*)a + sizeof(jl_array_t) +jl_array_ndimwords(a->flags.ndims) * sizeof(size_t); gc_heap_snapshot_record_hidden_edge(new_obj, data_ptr, jl_array_nbytes(a), 2); } @@ -2879,102 +2587,81 @@ mark: { else if (flags.how == 3) { jl_value_t *owner = jl_array_data_owner(a); uintptr_t nptr = (1 << 2) | (bits & GC_OLD); + gc_try_claim_and_push(mq, owner, &nptr); gc_heap_snapshot_record_internal_array_edge(new_obj, owner); - int markowner = gc_try_setmark(owner, &nptr, &tag, &bits); gc_mark_push_remset(ptls, new_obj, nptr); - if (markowner) { - new_obj = owner; - goto mark; - } - goto pop; + return; } - if (a->data == NULL || jl_array_len(a) == 0) - goto pop; + if (!a->data || jl_array_len(a) == 0) + return; if (flags.ptrarray) { - if ((jl_datatype_t*)jl_tparam0(vt) == jl_symbol_type) - goto pop; + if ((jl_datatype_t *)jl_tparam0(vt) == jl_symbol_type) + return; size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 
1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (flags.hasptr) { - jl_datatype_t *et = (jl_datatype_t*)jl_tparam0(vt); + jl_datatype_t *et = (jl_datatype_t *)jl_tparam0(vt); const jl_datatype_layout_t *layout = et->layout; unsigned npointers = layout->npointers; - unsigned elsize = a->elsize / sizeof(jl_value_t*); + unsigned elsize = a->elsize / sizeof(jl_value_t *); size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l * elsize; + uint32_t step = elsize; uintptr_t nptr = ((l * npointers) << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l * elsize; if (npointers == 1) { // TODO: detect anytime time stride is uniform? objary_begin += layout->first_ptr; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, elsize, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (layout->fielddesc_type == 0) { - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; - gc_mark_array8_t markdata = {objary_begin, objary_end, obj8_begin, {new_obj, obj8_begin, obj8_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array8), - &markdata, sizeof(markdata), 0); - ary8 = (gc_mark_array8_t*)sp.data; - goto array8_loaded; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; + gc_mark_array8(ptls, objary_parent, objary_begin, objary_end, obj8_begin, + obj8_end, nptr); } else if (layout->fielddesc_type == 1) { - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; - gc_mark_array16_t markdata = {objary_begin, objary_end, obj16_begin, {new_obj, obj16_begin, obj16_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array16), - &markdata, sizeof(markdata), 0); - ary16 = (gc_mark_array16_t*)sp.data; - goto array16_loaded; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; + gc_mark_array16(ptls, objary_parent, objary_begin, objary_end, obj16_begin, + obj16_end, nptr); } else { assert(0 && "unimplemented"); } } - goto pop; } else if (vt == jl_module_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_module_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_module_t)); - jl_module_t *m = (jl_module_t*)new_obj; - jl_binding_t **table = (jl_binding_t**)m->bindings.table; - size_t bsize = m->bindings.size; - uintptr_t nptr = ((bsize + m->usings.len + 1) << 2) | (bits & GC_OLD); - gc_mark_binding_t markdata = {m, table + 1, table + bsize, nptr, bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(module_binding), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto module_binding; + jl_module_t *mb_parent = (jl_module_t *)new_obj; + jl_binding_t **mb_begin = (jl_binding_t **)mb_parent->bindings.table + 1; + size_t bsize = mb_parent->bindings.size; + jl_binding_t **mb_end = (jl_binding_t 
**)mb_parent->bindings.table + bsize; + uintptr_t nptr = ((bsize + mb_parent->usings.len + 1) << 2) | (bits & GC_OLD); + gc_mark_module_binding(ptls, mb_parent, mb_begin, mb_end, nptr, bits); } else if (vt == jl_task_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_task_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_task_t)); - jl_task_t *ta = (jl_task_t*)new_obj; + jl_task_t *ta = (jl_task_t *)new_obj; gc_scrub_record_task(ta); if (gc_cblist_task_scanner) { - export_gc_state(ptls, &sp); int16_t tid = jl_atomic_load_relaxed(&ta->tid); - gc_invoke_callbacks(jl_gc_cb_task_scanner_t, - gc_cblist_task_scanner, - (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); - import_gc_state(ptls, &sp); + gc_invoke_callbacks(jl_gc_cb_task_scanner_t, gc_cblist_task_scanner, + (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); } -#ifdef COPY_STACKS + #ifdef COPY_STACKS void *stkbuf = ta->stkbuf; if (stkbuf && ta->copy_stack) { gc_setmark_buf_(ptls, stkbuf, bits, ta->bufsz); @@ -2983,14 +2670,14 @@ mark: { // TODO: edge to stack data // TODO: synthetic node for stack data (how big is it?) } -#endif + #endif jl_gcframe_t *s = ta->gcstack; size_t nroots; uintptr_t offset = 0; uintptr_t lb = 0; uintptr_t ub = (uintptr_t)-1; -#ifdef COPY_STACKS - if (stkbuf && ta->copy_stack && ta->ptls == NULL) { + #ifdef COPY_STACKS + if (stkbuf && ta->copy_stack && !ta->ptls) { int16_t tid = jl_atomic_load_relaxed(&ta->tid); assert(tid >= 0); jl_ptls_t ptls2 = gc_all_tls_states[tid]; @@ -2998,38 +2685,38 @@ mark: { lb = ub - ta->copy_stack; offset = (uintptr_t)stkbuf - lb; } -#endif - if (s) { + #endif + if (s != NULL) { nroots = gc_read_stack(&s->nroots, offset, lb, ub); gc_heap_snapshot_record_task_to_frame_edge(ta, s); - assert(nroots <= UINT32_MAX); - gc_mark_stackframe_t stackdata = {s, 0, (uint32_t)nroots, offset, lb, ub}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(stack), - &stackdata, sizeof(stackdata), 1); + gc_mark_stack(ptls, s, (uint32_t)nroots, offset, lb, ub); } if (ta->excstack) { - gc_heap_snapshot_record_task_to_frame_edge(ta, ta->excstack); - gc_setmark_buf_(ptls, ta->excstack, bits, sizeof(jl_excstack_t) + - sizeof(uintptr_t)*ta->excstack->reserved_size); - gc_mark_excstack_t stackdata = {ta->excstack, ta->excstack->top, 0, 0}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(excstack), - &stackdata, sizeof(stackdata), 1); + jl_excstack_t *excstack = ta->excstack; + gc_heap_snapshot_record_task_to_frame_edge(ta, excstack); + size_t itr = ta->excstack->top; + gc_setmark_buf_(ptls, excstack, bits, + sizeof(jl_excstack_t) + + sizeof(uintptr_t) * excstack->reserved_size); + gc_mark_excstack(ptls, excstack, itr); } const jl_datatype_layout_t *layout = jl_task_type->layout; assert(layout->fielddesc_type == 0); assert(layout->nfields > 0); uint32_t npointers = layout->npointers; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)ta; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; // assume tasks always reference young objects: set lowest bit uintptr_t nptr = (npointers << 2) | 1 | bits; - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - obj8_parent = (char*)ta; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, 
nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (vt == jl_string_type) { size_t dtsz = jl_string_len(new_obj) + sizeof(size_t) + 1; @@ -3037,140 +2724,388 @@ mark: { gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); - goto pop; } else { if (__unlikely(!jl_is_datatype(vt))) - gc_assert_datatype_fail(ptls, vt, sp); + gc_assert_datatype_fail(ptls, vt, mq); size_t dtsz = jl_datatype_size(vt); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); if (vt == jl_weakref_type) - goto pop; + return; const jl_datatype_layout_t *layout = vt->layout; uint32_t npointers = layout->npointers; if (npointers == 0) - goto pop; - uintptr_t nptr = npointers << 2 | (bits & GC_OLD); - assert((layout->nfields > 0 || layout->fielddesc_type == 3) && "opaque types should have been handled specially"); + return; + uintptr_t nptr = (npointers << 2 | (bits & GC_OLD)); + assert((layout->nfields > 0 || layout->fielddesc_type == 3) && + "opaque types should have been handled specially"); if (layout->fielddesc_type == 0) { - obj8_parent = (char*)new_obj; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)new_obj; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; assert(obj8_begin < obj8_end); - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 1) { - obj16_parent = (char*)new_obj; - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; + char *obj16_parent = (char *)new_obj; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; assert(obj16_begin < obj16_end); - gc_mark_obj16_t markdata = {new_obj, obj16_begin, obj16_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj16), - &markdata, sizeof(markdata), 0); - obj16 = (gc_mark_obj16_t*)sp.data; - goto obj16_loaded; + new_obj = gc_mark_obj16(ptls, obj16_parent, obj16_begin, obj16_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 2) { // This is very uncommon // Do not do store to load forwarding to save some code size - uint32_t *obj32_begin = (uint32_t*)jl_dt_layout_ptrs(layout); + char *obj32_parent = (char *)new_obj; + uint32_t *obj32_begin = (uint32_t *)jl_dt_layout_ptrs(layout); uint32_t *obj32_end = obj32_begin + npointers; - gc_mark_obj32_t markdata = {new_obj, obj32_begin, obj32_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj32), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto obj32; + assert(obj32_begin < obj32_end); + new_obj = gc_mark_obj32(ptls, obj32_parent, obj32_begin, obj32_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else { assert(layout->fielddesc_type == 3); - 
jl_fielddescdyn_t *desc = (jl_fielddescdyn_t*)jl_dt_layout_fields(layout);
+            jl_fielddescdyn_t *desc = (jl_fielddescdyn_t *)jl_dt_layout_fields(layout);
             int old = jl_astaggedvalue(new_obj)->bits.gc & 2;
-            export_gc_state(ptls, &sp);
             uintptr_t young = desc->markfunc(ptls, new_obj);
-            import_gc_state(ptls, &sp);
             if (old && young)
                 gc_mark_push_remset(ptls, new_obj, young * 4 + 3);
+            }
+        }
+    }
+}
+
+// Used in gc-debug
+void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq)
+{
+    while (1) {
+        void *new_obj = (void *)gc_ptr_queue_pop(&ptls->mark_queue);
+        // No more objects to mark
+        if (__unlikely(new_obj == NULL)) {
+            return;
+        }
+        gc_mark_outrefs(ptls, mq, new_obj, 0);
+    }
+}
+
+// Drain items from worker's own chunkqueue
+void gc_drain_own_chunkqueue(jl_ptls_t ptls, jl_gc_markqueue_t *mq)
+{
+    jl_gc_chunk_t c = {.cid = GC_empty_chunk};
+    do {
+        c = gc_chunkqueue_pop(mq);
+        if (c.cid != GC_empty_chunk) {
+            gc_mark_chunk(ptls, mq, &c);
+            gc_mark_loop_serial_(ptls, mq);
+        }
+    } while (c.cid != GC_empty_chunk);
+}
+
+// Main mark loop. A stack (allocated on the heap) of `jl_value_t *`
+// is used to keep track of processed items. Maintaining this stack (instead of
+// the native one) avoids stack overflow when marking deep objects and
+// makes it easier to implement parallel marking via work-stealing
+JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls)
+{
+    gc_mark_loop_serial_(ptls, &ptls->mark_queue);
+    gc_drain_own_chunkqueue(ptls, &ptls->mark_queue);
+}
+
+void gc_mark_and_steal(jl_ptls_t ptls)
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_gc_markqueue_t *mq_master = NULL;
+    int master_tid = jl_atomic_load(&gc_master_tid);
+    if (master_tid != -1)
+        mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    void *new_obj;
+    jl_gc_chunk_t c;
+    pop : {
+        new_obj = gc_ptr_queue_pop(mq);
+        if (new_obj != NULL) {
+            goto mark;
+        }
+        c = gc_chunkqueue_pop(mq);
+        if (c.cid != GC_empty_chunk) {
+            gc_mark_chunk(ptls, mq, &c);
+            goto pop;
+        }
+        goto steal;
+    }
+    mark : {
+        gc_mark_outrefs(ptls, mq, new_obj, 0);
+        goto pop;
+    }
+    // Note that for the stealing heuristics, we try to
+    // steal chunks much more aggressively than pointers,
+    // since we know chunks will likely expand into a lot
+    // of work for the mark loop
+    steal : {
+        // Try to steal chunk from random GC thread
+        for (int i = 0; i < 4 * jl_n_gcthreads; i++) {
+            uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads;
+            jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue;
+            c = gc_chunkqueue_steal_from(mq2);
+            if (c.cid != GC_empty_chunk) {
+                gc_mark_chunk(ptls, mq, &c);
+                goto pop;
+            }
+        }
+        // Sequentially walk GC threads to try to steal chunk
+        for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) {
+            jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue;
+            c = gc_chunkqueue_steal_from(mq2);
+            if (c.cid != GC_empty_chunk) {
+                gc_mark_chunk(ptls, mq, &c);
+                goto pop;
+            }
+        }
+        // Try to steal chunk from master thread
+        if (mq_master != NULL) {
+            c = gc_chunkqueue_steal_from(mq_master);
+            if (c.cid != GC_empty_chunk) {
+                gc_mark_chunk(ptls, mq, &c);
+                goto pop;
+            }
+        }
+        // Try to steal pointer from random GC thread
+        for (int i = 0; i < 4 * jl_n_gcthreads; i++) {
+            uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads;
+            jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue;
+            new_obj = gc_ptr_queue_steal_from(mq2);
+            if (new_obj != NULL)
+                goto mark;
+        }
+        // Sequentially walk GC threads to try to steal pointer
+        for (int
i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) { + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue; + new_obj = gc_ptr_queue_steal_from(mq2); + if (new_obj != NULL) + goto mark; + } + // Try to steal pointer from master thread + if (mq_master != NULL) { + new_obj = gc_ptr_queue_steal_from(mq_master); + if (new_obj != NULL) + goto mark; + } + } +} + +#define GC_BACKOFF_MIN 4 +#define GC_BACKOFF_MAX 12 + +void gc_mark_backoff(int *i) +{ + if (*i < GC_BACKOFF_MAX) { + (*i)++; + } + for (int j = 0; j < (1 << *i); j++) { + jl_cpu_pause(); + } +} + +void gc_mark_loop_parallel(jl_ptls_t ptls, int master) +{ + int backoff = GC_BACKOFF_MIN; + if (master) { + jl_atomic_store(&gc_master_tid, ptls->tid); + // Wake threads up and try to do some work + uv_mutex_lock(&gc_threads_lock); + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); + gc_mark_and_steal(ptls); + jl_atomic_fetch_add(&gc_n_threads_marking, -1); + } + while (jl_atomic_load(&gc_n_threads_marking) > 0) { + // Try to become a thief while other threads are marking + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + if (jl_atomic_load(&gc_master_tid) != -1) { + gc_mark_and_steal(ptls); + } + jl_atomic_fetch_add(&gc_n_threads_marking, -1); + // Failed to steal + gc_mark_backoff(&backoff); + } +} + +void gc_mark_loop(jl_ptls_t ptls) +{ + if (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled) { + gc_mark_loop_serial(ptls); + } + else { + gc_mark_loop_parallel(ptls, 1); + } +} + +void gc_mark_loop_barrier(void) +{ + jl_atomic_store(&gc_master_tid, -1); + while (jl_atomic_load(&gc_n_threads_marking) != 0) { + jl_cpu_pause(); + } +} + +void gc_mark_clean_reclaim_sets(void) +{ + // Clean up `reclaim-sets` + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + arraylist_t *reclaim_set2 = &ptls2->mark_queue.reclaim_set; + ws_array_t *a = NULL; + while ((a = (ws_array_t *)arraylist_pop(reclaim_set2)) != NULL) { + free(a->buffer); + free(a); + } + } +} + +static void gc_premark(jl_ptls_t ptls2) +{ + arraylist_t *remset = ptls2->heap.remset; + ptls2->heap.remset = ptls2->heap.last_remset; + ptls2->heap.last_remset = remset; + ptls2->heap.remset->len = 0; + ptls2->heap.remset_nptr = 0; + // avoid counting remembered objects & bindings twice + // in `perm_scanned_bytes` + size_t len = remset->len; + void **items = remset->items; + for (size_t i = 0; i < len; i++) { + jl_value_t *item = (jl_value_t*)items[i]; + objprofile_count(jl_typeof(item), 2, 0); + jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; + } + len = ptls2->heap.rem_bindings.len; + items = ptls2->heap.rem_bindings.items; + for (size_t i = 0; i < len; i++) { + void *ptr = items[i]; + jl_astaggedvalue(ptr)->bits.gc = GC_OLD_MARKED; } } -static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_ptls_t ptls2) +static void gc_queue_thread_local(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) { jl_task_t *task; task = ptls2->root_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "root task"); } task = jl_atomic_load_relaxed(&ptls2->current_task); - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "current task"); } task = ptls2->next_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + 
gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "next task"); } task = ptls2->previous_task; - if (task) { // shouldn't be necessary, but no reason not to - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "previous task"); } if (ptls2->previous_exception) { - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); + gc_try_claim_and_push(mq, ptls2->previous_exception, NULL); gc_heap_snapshot_record_root((jl_value_t*)ptls2->previous_exception, "previous exception"); } } +static void gc_queue_bt_buf(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) +{ + jl_bt_element_t *bt_data = ptls2->bt_data; + size_t bt_size = ptls2->bt_size; + for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { + jl_bt_element_t *bt_entry = bt_data + i; + if (jl_bt_is_native(bt_entry)) + continue; + size_t njlvals = jl_bt_num_jlvals(bt_entry); + for (size_t j = 0; j < njlvals; j++) + gc_try_claim_and_push(mq, jl_bt_entry_jlvalue(bt_entry, j), NULL); + } +} + +static void gc_queue_remset(jl_ptls_t ptls, jl_ptls_t ptls2) +{ + size_t len = ptls2->heap.last_remset->len; + void **items = ptls2->heap.last_remset->items; + for (size_t i = 0; i < len; i++) { + // Objects in the `remset` are already marked, + // so a `gc_try_claim_and_push` wouldn't work here + gc_mark_outrefs(ptls, &ptls->mark_queue, (jl_value_t *)items[i], 1); + } + int n_bnd_refyoung = 0; + len = ptls2->heap.rem_bindings.len; + items = ptls2->heap.rem_bindings.items; + for (size_t i = 0; i < len; i++) { + jl_binding_t *ptr = (jl_binding_t*)items[i]; + uintptr_t bnd_refyoung = 0; + jl_value_t *v = jl_atomic_load_relaxed(&ptr->value); + gc_try_claim_and_push(&ptls->mark_queue, v, &bnd_refyoung); + jl_value_t *ty = jl_atomic_load_relaxed(&ptr->ty); + gc_try_claim_and_push(&ptls->mark_queue, ty, &bnd_refyoung); + jl_value_t *globalref = jl_atomic_load_relaxed(&ptr->globalref); + gc_try_claim_and_push(&ptls->mark_queue, globalref, &bnd_refyoung); + if (bnd_refyoung) { + items[n_bnd_refyoung] = ptr; + n_bnd_refyoung++; + } + } + ptls2->heap.rem_bindings.len = n_bnd_refyoung; +} + extern jl_value_t *cmpswap_names JL_GLOBALLY_ROOTED; // mark the initial root set -static void mark_roots(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) +static void gc_mark_roots(jl_gc_markqueue_t *mq) { // modules - gc_mark_queue_obj(gc_cache, sp, jl_main_module); + gc_try_claim_and_push(mq, jl_main_module, NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_main_module, "main_module"); - // invisible builtin values - if (jl_an_empty_vec_any != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_an_empty_vec_any); - if (jl_module_init_order != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_module_init_order); + gc_try_claim_and_push(mq, jl_an_empty_vec_any, NULL); + gc_try_claim_and_push(mq, jl_module_init_order, NULL); for (size_t i = 0; i < jl_current_modules.size; i += 2) { if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { - gc_mark_queue_obj(gc_cache, sp, jl_current_modules.table[i]); + gc_try_claim_and_push(mq, jl_current_modules.table[i], NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_current_modules.table[i], "top level module"); } } - gc_mark_queue_obj(gc_cache, sp, jl_anytuple_type_type); + gc_try_claim_and_push(mq, jl_anytuple_type_type, NULL); for (size_t i = 0; i < N_CALL_CACHE; i++) { jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); - if (v != NULL) { - gc_mark_queue_obj(gc_cache, sp, v); - } + 
gc_try_claim_and_push(mq, v, NULL); } - if (jl_all_methods != NULL) { - gc_mark_queue_obj(gc_cache, sp, jl_all_methods); - } - if (_jl_debug_method_invalidation != NULL) - gc_mark_queue_obj(gc_cache, sp, _jl_debug_method_invalidation); - + gc_try_claim_and_push(mq, jl_all_methods, NULL); + gc_try_claim_and_push(mq, _jl_debug_method_invalidation, NULL); // constants - gc_mark_queue_obj(gc_cache, sp, jl_emptytuple_type); - if (cmpswap_names != NULL) - gc_mark_queue_obj(gc_cache, sp, cmpswap_names); - gc_mark_queue_obj(gc_cache, sp, jl_global_roots_table); + gc_try_claim_and_push(mq, jl_emptytuple_type, NULL); + gc_try_claim_and_push(mq, cmpswap_names, NULL); + gc_try_claim_and_push(mq, jl_global_roots_table, NULL); } // find unmarked objects that need to be finalized from the finalizer list "list". @@ -3310,76 +3245,6 @@ JL_DLLEXPORT int64_t jl_gc_live_bytes(void) return live_bytes; } -static void jl_gc_premark(jl_ptls_t ptls2) -{ - arraylist_t *remset = ptls2->heap.remset; - ptls2->heap.remset = ptls2->heap.last_remset; - ptls2->heap.last_remset = remset; - ptls2->heap.remset->len = 0; - ptls2->heap.remset_nptr = 0; - - // avoid counting remembered objects & bindings twice - // in `perm_scanned_bytes` - size_t len = remset->len; - void **items = remset->items; - for (size_t i = 0; i < len; i++) { - jl_value_t *item = (jl_value_t*)items[i]; - objprofile_count(jl_typeof(item), 2, 0); - jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; - } - len = ptls2->heap.rem_bindings.len; - items = ptls2->heap.rem_bindings.items; - for (size_t i = 0; i < len; i++) { - void *ptr = items[i]; - jl_astaggedvalue(ptr)->bits.gc = GC_OLD_MARKED; - } -} - -static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) -{ - size_t len = ptls2->heap.last_remset->len; - void **items = ptls2->heap.last_remset->items; - for (size_t i = 0; i < len; i++) - gc_mark_queue_scan_obj(gc_cache, sp, (jl_value_t*)items[i]); - int n_bnd_refyoung = 0; - len = ptls2->heap.rem_bindings.len; - items = ptls2->heap.rem_bindings.items; - for (size_t i = 0; i < len; i++) { - jl_binding_t *ptr = (jl_binding_t*)items[i]; - // A null pointer can happen here when the binding is cleaned up - // as an exception is thrown after it was already queued (#10221) - int bnd_refyoung = 0; - jl_value_t *v = jl_atomic_load_relaxed(&ptr->value); - if (v != NULL && gc_mark_queue_obj(gc_cache, sp, v)) - bnd_refyoung = 1; - jl_value_t *ty = jl_atomic_load_relaxed(&ptr->ty); - if (ty != NULL && gc_mark_queue_obj(gc_cache, sp, ty)) - bnd_refyoung = 1; - jl_value_t *globalref = jl_atomic_load_relaxed(&ptr->globalref); - if (globalref != NULL && gc_mark_queue_obj(gc_cache, sp, globalref)) - bnd_refyoung = 1; - if (bnd_refyoung) { - items[n_bnd_refyoung] = ptr; - n_bnd_refyoung++; - } - } - ptls2->heap.rem_bindings.len = n_bnd_refyoung; -} - -static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) -{ - jl_bt_element_t *bt_data = ptls2->bt_data; - size_t bt_size = ptls2->bt_size; - for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { - jl_bt_element_t *bt_entry = bt_data + i; - if (jl_bt_is_native(bt_entry)) - continue; - size_t njlvals = jl_bt_num_jlvals(bt_entry); - for (size_t j = 0; j < njlvals; j++) - gc_mark_queue_obj(gc_cache, sp, jl_bt_entry_jlvalue(bt_entry, j)); - } -} - size_t jl_maxrss(void); // Only one thread should be running in this function @@ -3387,9 +3252,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) { 
combine_thread_gc_counts(&gc_num); - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t *mq = &ptls->mark_queue; uint64_t gc_start_time = jl_hrtime(); int64_t last_perm_scanned_bytes = perm_scanned_bytes; @@ -3401,33 +3264,39 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) - jl_gc_premark(ptls2); + gc_premark(ptls2); } assert(gc_n_threads); + int single_threaded = (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - // 2.1. mark every object in the `last_remsets` and `rem_binding` - jl_gc_queue_remset(gc_cache, &sp, ptls2); - // 2.2. mark every thread local root - jl_gc_queue_thread_local(gc_cache, &sp, ptls2); - // 2.3. mark any managed objects in the backtrace buffer - // TODO: treat these as roots for gc_heap_snapshot_record - jl_gc_queue_bt_buf(gc_cache, &sp, ptls2); + jl_gc_markqueue_t *mq2 = mq; + jl_ptls_t ptls_gc_thread = NULL; + if (!single_threaded) { + ptls_gc_thread = gc_all_tls_states[gc_first_tid + t_i % jl_n_gcthreads]; + mq2 = &ptls_gc_thread->mark_queue; + } + if (ptls2 != NULL) { + // 2.1. mark every thread local root + gc_queue_thread_local(mq2, ptls2); + // 2.2. mark any managed objects in the backtrace buffer + // TODO: treat these as roots for gc_heap_snapshot_record + gc_queue_bt_buf(mq2, ptls2); + // 2.3. mark every object in the `last_remsets` and `rem_binding` + gc_queue_remset(single_threaded ? ptls : ptls_gc_thread, ptls2); + } } // 3. walk roots - mark_roots(gc_cache, &sp); + gc_mark_roots(mq); if (gc_cblist_root_scanner) { - export_gc_state(ptls, &sp); gc_invoke_callbacks(jl_gc_cb_root_scanner_t, gc_cblist_root_scanner, (collection)); - import_gc_state(ptls, &sp); } - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop(ptls); + gc_mark_loop_barrier(); + gc_mark_clean_reclaim_sets(); gc_num.since_sweep += gc_num.allocd; JL_PROBE_GC_MARK_END(scanned_bytes, perm_scanned_bytes); gc_settime_premark_end(); @@ -3448,9 +3317,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_finalizer_list(&ptls2->finalizers); + if (ptls2 != NULL) + sweep_finalizer_list(&ptls2->finalizers); } if (prev_sweep_full) { sweep_finalizer_list(&finalizer_list_marked); @@ -3459,15 +3327,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + if (ptls2 != NULL) + gc_mark_finlist(mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len); + gc_mark_finlist(mq, &finalizer_list_marked, orig_marked_len); // "Flush" the mark stack before flipping the reset_age bit // so that the objects are not incorrectly reset. - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop_serial(ptls); // Conservative marking relies on age to tell allocated objects // and freelist entries apart. 
mark_reset_age = !jl_gc_conservative_gc_support_enabled(); @@ -3475,9 +3341,10 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // `to_finalize` list. These objects are only reachable from this list // and should not be referenced by any old objects so this won't break // the GC invariant. - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(mq, &to_finalize, 0); + gc_mark_loop_serial(ptls); mark_reset_age = 0; + gc_settime_postmark_end(); // Flush everything in mark cache @@ -3502,9 +3369,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - nptr += ptls2->heap.remset_nptr; + if (ptls2 != NULL) + nptr += ptls2->heap.remset_nptr; } // many pointers in the intergen frontier => "quick" mark is not quick @@ -3525,7 +3391,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) size_t maxmem = 0; #ifdef _P64 // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2 - maxmem = total_mem / gc_n_threads / 2; + maxmem = total_mem / (gc_n_threads - jl_n_gcthreads) / 2; #endif if (maxmem < max_collect_interval) maxmem = max_collect_interval; @@ -3557,8 +3423,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) last_long_collect_interval = gc_num.interval; } scanned_bytes = 0; - // 5. start sweeping pool_live_bytes = 0; + // 6. start sweeping uint64_t start_sweep_time = jl_hrtime(); JL_PROBE_GC_SWEEP_BEGIN(sweep_full); sweep_weak_refs(); @@ -3585,7 +3451,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } // sweeping is over - // 6. if it is a quick sweep, put back the remembered objects in queued state + // 7. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. 
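Worked example for the revised maxmem heuristic above (illustrative numbers): dedicated GC threads never allocate, so they are now excluded from the divisor. On a machine with total_mem = 64 GiB, gc_n_threads = 8 and jl_n_gcthreads = 4, the cap was previously 64 GiB / 8 / 2 = 4 GiB and is now 64 GiB / (8 - 4) / 2 = 8 GiB.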
assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { @@ -3619,7 +3485,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } #endif - _report_gc_finished(pause, gc_num.freed, sweep_full, recollect); gc_final_pause_end(gc_start_time, gc_end_time); @@ -3637,19 +3502,19 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (collection == JL_GC_AUTO) { //If we aren't freeing enough or are seeing lots and lots of pointers let it increase faster - if(!not_freed_enough || large_frontier) { + if (!not_freed_enough || large_frontier) { int64_t tot = 2 * (live_bytes + gc_num.since_sweep) / 3; if (gc_num.interval > tot) { gc_num.interval = tot; last_long_collect_interval = tot; } + } // If the current interval is larger than half the live data decrease the interval - } else { + else { int64_t half = (live_bytes / 2); if (gc_num.interval > half) gc_num.interval = half; } - // But never go below default if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval; } @@ -3771,16 +3636,15 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) errno = last_errno; } -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; assert(gc_n_threads); for (size_t i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) - jl_gc_queue_thread_local(gc_cache, sp, ptls2); + if (ptls2 != NULL) + gc_queue_thread_local(mq, ptls2); } - mark_roots(gc_cache, sp); + gc_mark_roots(mq); } // allocator entry points @@ -3817,10 +3681,20 @@ void jl_init_thread_heap(jl_ptls_t ptls) gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; gc_cache->nbig_obj = 0; - size_t init_size = 1024; - gc_cache->pc_stack = (void**)malloc_s(init_size * sizeof(void*)); - gc_cache->pc_stack_end = gc_cache->pc_stack + init_size; - gc_cache->data_stack = (jl_gc_mark_data_t *)malloc_s(init_size * sizeof(jl_gc_mark_data_t)); + + // Initialize GC mark-queue + jl_gc_markqueue_t *mq = &ptls->mark_queue; + ws_queue_t *cq = &mq->chunk_queue; + ws_array_t *wsa = create_ws_array(GC_CHUNK_QUEUE_INIT_SIZE, sizeof(jl_gc_chunk_t)); + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, wsa); + ws_queue_t *q = &mq->ptr_queue; + ws_array_t *wsa2 = create_ws_array(GC_PTR_QUEUE_INIT_SIZE, sizeof(jl_value_t *)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, wsa2); + arraylist_new(&mq->reclaim_set, 32); memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); @@ -3833,6 +3707,8 @@ void jl_gc_init(void) JL_MUTEX_INIT(&finalizers_lock); uv_mutex_init(&gc_cache_lock); uv_mutex_init(&gc_perm_lock); + uv_mutex_init(&gc_threads_lock); + uv_cond_init(&gc_threads_cond); jl_gc_init_page(); jl_gc_debug_init(); @@ -3864,9 +3740,6 @@ void jl_gc_init(void) #endif if (jl_options.heap_size_hint) jl_gc_set_max_memory(jl_options.heap_size_hint); - - jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL}; - gc_mark_loop(NULL, sp); t_start = jl_hrtime(); } @@ -3894,7 +3767,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); 
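Worked example for the auto-collection interval adjustment above (illustrative numbers): with live_bytes = 3 GiB and gc_num.since_sweep = 3 GiB, the first branch caps gc_num.interval at 2 * (3 + 3) GiB / 3 = 4 GiB, while the second branch caps it at live_bytes / 2 = 1.5 GiB; in either case the interval is floored at default_collect_interval.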
jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3909,7 +3782,7 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3925,7 +3798,7 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; free(p); - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; jl_atomic_store_relaxed(&ptls->gc_num.freed, jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); @@ -3938,7 +3811,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (sz < old) diff --git a/src/gc.h b/src/gc.h index cf6580bd69df3..adc89e4769dfc 100644 --- a/src/gc.h +++ b/src/gc.h @@ -42,7 +42,6 @@ extern "C" { typedef struct { uint64_t num; uint64_t next; - uint64_t min; uint64_t interv; uint64_t max; @@ -86,163 +85,33 @@ typedef struct { uint64_t last_incremental_sweep; } jl_gc_num_t; -enum { - GC_MARK_L_marked_obj, - GC_MARK_L_scan_only, - GC_MARK_L_finlist, - GC_MARK_L_objarray, - GC_MARK_L_array8, - GC_MARK_L_array16, - GC_MARK_L_obj8, - GC_MARK_L_obj16, - GC_MARK_L_obj32, - GC_MARK_L_stack, - GC_MARK_L_excstack, - GC_MARK_L_module_binding, - _GC_MARK_L_MAX -}; - -// The following structs (`gc_mark_*_t`) contain iterator state used for the -// scanning of various object types. -// -// The `nptr` member records the number of pointers slots referenced by -// an object to be used in the full collection heuristics as well as whether the object -// references young objects. -// `nptr >> 2` is the number of pointers fields referenced by the object. -// The lowest bit of `nptr` is set if the object references young object. -// The 2nd lowest bit of `nptr` is the GC old bits of the object after marking. -// A `0x3` in the low bits means that the object needs to be in the remset. - -// An generic object that's marked and needs to be scanned -// The metadata might need update too (depend on the PC) -typedef struct { - jl_value_t *obj; // The object - uintptr_t tag; // The tag with the GC bits masked out - uint8_t bits; // The GC bits after tagging (`bits & 1 == 1`) -} gc_mark_marked_obj_t; - -// An object array. This can come from an array, svec, or the using array or a module -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint32_t step; // Number of pointers to jump between marks - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_objarray_t; - -// A normal object with 8bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint8_t *begin; // Current field descriptor. - uint8_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj8_t; - -// A normal object with 16bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint16_t *begin; // Current field descriptor. 
- uint16_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj16_t; - -// A normal object with 32bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint32_t *begin; // Current field descriptor. - uint32_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj32_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint8_t *rebegin; - gc_mark_obj8_t elem; -} gc_mark_array8_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint16_t *rebegin; - gc_mark_obj16_t elem; -} gc_mark_array16_t; - -// Stack frame -typedef struct { - jl_gcframe_t *s; // The current stack frame - uint32_t i; // The current slot index in the frame - uint32_t nroots; // `nroots` fields in the frame - // Parameters to mark the copy_stack range. - uintptr_t offset; - uintptr_t lb; - uintptr_t ub; -} gc_mark_stackframe_t; - -// Exception stack data -typedef struct { - jl_excstack_t *s; // Stack of exceptions - size_t itr; // Iterator into exception stack - size_t bt_index; // Current backtrace buffer entry index - size_t jlval_index; // Index into GC managed values for current bt entry -} gc_mark_excstack_t; - -// Module bindings. This is also the beginning of module scanning. -// The loop will start marking other references in a module after the bindings are marked -typedef struct { - jl_module_t *parent; // The parent module to trigger write barrier on. - jl_binding_t **begin; // The first slot to be scanned. - jl_binding_t **end; // The end address (after the last slot to be scanned) - uintptr_t nptr; // See notes about `nptr` above. - uint8_t bits; // GC bits of the module (the bits to mark the binding buffer with) -} gc_mark_binding_t; - -// Finalizer (or object) list -typedef struct { - jl_value_t **begin; - jl_value_t **end; -} gc_mark_finlist_t; - -// This is used to determine the max size of the data objects on the data stack. -// We'll use this size to determine the size of the data stack corresponding to a -// PC stack size. Since the data objects are not all of the same size, we'll waste -// some memory on the data stack this way but that size is unlikely going to be significant. -union _jl_gc_mark_data { - gc_mark_marked_obj_t marked; - gc_mark_objarray_t objarray; - gc_mark_array8_t array8; - gc_mark_array16_t array16; - gc_mark_obj8_t obj8; - gc_mark_obj16_t obj16; - gc_mark_obj32_t obj32; - gc_mark_stackframe_t stackframe; - gc_mark_excstack_t excstackframe; - gc_mark_binding_t binding; - gc_mark_finlist_t finlist; -}; - -// Pop a data struct from the mark data stack (i.e. decrease the stack pointer) -// This should be used after dispatch and therefore the pc stack pointer is already popped from -// the stack. -STATIC_INLINE void *gc_pop_markdata_(jl_gc_mark_sp_t *sp, size_t size) -{ - jl_gc_mark_data_t *data = (jl_gc_mark_data_t *)(((char*)sp->data) - size); - sp->data = data; - return data; -} -#define gc_pop_markdata(sp, type) ((type*)gc_pop_markdata_(sp, sizeof(type))) - -// Re-push a frame to the mark stack (both data and pc) -// The data and pc are expected to be on the stack (or updated in place) already. -// Mainly useful to pause the current scanning in order to scan an new object. 
-STATIC_INLINE void *gc_repush_markdata_(jl_gc_mark_sp_t *sp, size_t size) JL_NOTSAFEPOINT -{ - jl_gc_mark_data_t *data = sp->data; - sp->pc++; - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + size); - return data; -} -#define gc_repush_markdata(sp, type) ((type*)gc_repush_markdata_(sp, sizeof(type))) +// Array chunks (work items representing suffixes of +// large arrays of pointers left to be marked) + +typedef enum { + GC_empty_chunk = 0, // for sentinel representing no items left in chunk queue + GC_objary_chunk, // for chunk of object array + GC_ary8_chunk, // for chunk of array with 8 bit field descriptors + GC_ary16_chunk, // for chunk of array with 16 bit field descriptors + GC_finlist_chunk, // for chunk of finalizer list +} gc_chunk_id_t; + +typedef struct _jl_gc_chunk_t { + gc_chunk_id_t cid; + struct _jl_value_t *parent; // array owner + struct _jl_value_t **begin; // pointer to first element that needs scanning + struct _jl_value_t **end; // pointer to last element that needs scanning + void *elem_begin; // used to scan pointers within objects when marking `ary8` or `ary16` + void *elem_end; // used to scan pointers within objects when marking `ary8` or `ary16` + uint32_t step; // step-size used when marking objarray + uintptr_t nptr; // (`nptr` & 0x1) if array has young element and (`nptr` & 0x2) if array owner is old +} jl_gc_chunk_t; + +#define GC_CHUNK_BATCH_SIZE (1 << 16) // maximum number of references that can be processed + // without creating a chunk + +#define GC_PTR_QUEUE_INIT_SIZE (1 << 18) // initial size of queue of `jl_value_t *` +#define GC_CHUNK_QUEUE_INIT_SIZE (1 << 14) // initial size of chunk-queue // layout for big (>2k) objects @@ -511,23 +380,16 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFE *list = hdr; } -STATIC_INLINE void gc_mark_sp_init(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) -{ - sp->pc = gc_cache->pc_stack; - sp->data = gc_cache->data_stack; - sp->pc_start = gc_cache->pc_stack; - sp->pc_end = gc_cache->pc_stack_end; -} - -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp); -void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start); -void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp); +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, + jl_value_t **fl_end) JL_NOTSAFEPOINT; +void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, + size_t start) JL_NOTSAFEPOINT; +void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_loop_serial(jl_ptls_t ptls); void sweep_stack_pools(void); void jl_gc_debug_init(void); -extern void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - // GC pages void jl_gc_init_page(void); @@ -612,7 +474,6 @@ static inline void gc_verify_tags(void) } #endif - #ifdef GC_VERIFY extern jl_value_t *lostval; void gc_verify(jl_ptls_t ptls); @@ -653,10 +514,9 @@ extern int gc_verifying; #define gc_verifying (0) #endif - int gc_slot_to_fieldidx(void *_obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT; int gc_slot_to_arrayidx(void *_obj, void *begin) JL_NOTSAFEPOINT; -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset); +NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int offset) JL_NOTSAFEPOINT; #ifdef GC_DEBUG_ENV JL_DLLEXPORT extern jl_gc_debug_env_t jl_gc_debug_env; diff --git a/src/init.c b/src/init.c index 522f16041d566..0ab6d59e6ec6a 100644 --- 
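To make the chunk machinery above concrete, a hedged sketch of how a large object array might be split during marking: one batch of at most GC_CHUNK_BATCH_SIZE references is pushed eagerly, and the suffix is queued as a GC_objary_chunk that any marker can later pop or steal. gc_chunkqueue_push is an assumed helper name wrapping ws_queue_push; the real logic lives in src/gc.c.

    #include "gc.h" // jl_gc_chunk_t, GC_CHUNK_BATCH_SIZE, jl_gc_markqueue_t

    // Sketch: mark one batch of an object array now and defer the rest as a
    // chunk work item on this thread's chunk queue.
    static void mark_objarray_in_batches(jl_ptls_t ptls, jl_value_t *parent,
                                         jl_value_t **begin, jl_value_t **end,
                                         uint32_t step, uintptr_t nptr)
    {
        jl_gc_markqueue_t *mq = &ptls->mark_queue;
        if (end - begin > GC_CHUNK_BATCH_SIZE) {
            // defer everything past the first batch
            jl_gc_chunk_t c = {GC_objary_chunk, parent,
                               begin + GC_CHUNK_BATCH_SIZE, end,
                               NULL, NULL, step, nptr};
            gc_chunkqueue_push(mq, &c); // assumed helper
            end = begin + GC_CHUNK_BATCH_SIZE;
        }
        for (; begin < end; begin += step)
            gc_try_claim_and_push(mq, *begin, &nptr);
    }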
a/src/init.c +++ b/src/init.c @@ -840,6 +840,7 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ if (jl_base_module == NULL) { // nthreads > 1 requires code in Base jl_atomic_store_relaxed(&jl_n_threads, 1); + jl_n_gcthreads = 0; } jl_start_threads(); diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc index fee57ed60dd7a..f28ecefbded4a 100644 --- a/src/jl_exported_data.inc +++ b/src/jl_exported_data.inc @@ -132,6 +132,7 @@ #define JL_EXPORTED_DATA_SYMBOLS(XX) \ XX(jl_n_threadpools, int) \ XX(jl_n_threads, _Atomic(int)) \ + XX(jl_n_gcthreads, int) \ XX(jl_options, jl_options_t) \ XX(jl_task_gcstack_offset, int) \ XX(jl_task_ptls_offset, int) \ diff --git a/src/jloptions.c b/src/jloptions.c index 988078c7c58d9..f325452e19e41 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -40,6 +40,7 @@ JL_DLLEXPORT void jl_init_options(void) NULL, // cpu_target ("native", "core2", etc...) 0, // nthreadpools 0, // nthreads + 0, // ngcthreads NULL, // nthreads_per_pool 0, // nprocs NULL, // machine_file @@ -129,6 +130,7 @@ static const char opts[] = " interface if supported (Linux and Windows) or to the number of CPU\n" " threads if not supported (MacOS) or if process affinity is not\n" " configured, and sets M to 1.\n" + " --gcthreads=N Use N threads for GC, set to half of the number of compute threads if unspecified.\n" " -p, --procs {N|auto} Integer value N launches N additional local worker processes\n" " \"auto\" launches as many workers as the number of local CPU threads (logical cores)\n" " --machine-file <file> Run processes on hosts listed in <file>\n\n" @@ -253,7 +255,8 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) opt_strip_metadata, opt_strip_ir, opt_heap_size_hint, - opt_permalloc_pkgimg + opt_permalloc_pkgimg, + opt_gc_threads, }; static const char* const shortopts = "+vhqH:e:E:L:J:C:it:p:O:g:"; static const struct option longopts[] = { @@ -278,6 +281,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) { "cpu-target", required_argument, 0, 'C' }, { "procs", required_argument, 0, 'p' }, { "threads", required_argument, 0, 't' }, + { "gcthreads", required_argument, 0, opt_gc_threads }, { "machine-file", required_argument, 0, opt_machine_file }, { "project", optional_argument, 0, opt_project }, { "color", required_argument, 0, opt_color }, @@ -827,6 +831,13 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) else jl_errorf("julia: invalid argument to --permalloc-pkgimg={yes|no} (%s)", optarg); break; + case opt_gc_threads: + errno = 0; + long ngcthreads = strtol(optarg, &endptr, 10); + if (errno != 0 || optarg == endptr || *endptr != 0 || ngcthreads < 1 || ngcthreads >= INT16_MAX) + jl_errorf("julia: --gcthreads=<n>; n must be an integer >= 1"); + jl_options.ngcthreads = (int16_t)ngcthreads; + break; default: jl_errorf("julia: unhandled option -- %c\n" "This is a bug, please report it.", c); diff --git a/src/jloptions.h b/src/jloptions.h index 0b72b4c3be062..93f6d321f38d6 100644 --- a/src/jloptions.h +++ b/src/jloptions.h @@ -15,6 +15,7 @@ typedef struct { const char *cpu_target; int8_t nthreadpools; int16_t nthreads; + int16_t ngcthreads; const int16_t *nthreads_per_pool; int32_t nprocs; const char *machine_file; diff --git a/src/julia.h b/src/julia.h index 0a88633905324..caebdf450ed75 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1684,6 +1684,7 @@ JL_DLLEXPORT jl_sym_t *jl_get_ARCH(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_get_libllvm(void) JL_NOTSAFEPOINT; extern JL_DLLIMPORT int jl_n_threadpools; extern
JL_DLLIMPORT _Atomic(int) jl_n_threads; +extern JL_DLLIMPORT int jl_n_gcthreads; extern JL_DLLIMPORT int *jl_n_threads_per_pool; // environment entries diff --git a/src/julia_internal.h b/src/julia_internal.h index ce5b9409f35be..9ecc97cc64683 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -582,8 +582,8 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT } } -void jl_gc_debug_print_status(void); -JL_DLLEXPORT void jl_gc_debug_critical_error(void); +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT; void jl_print_gc_stats(JL_STREAM *s); void jl_gc_reset_alloc_count(void); uint32_t jl_get_gs_ctr(void); @@ -1173,7 +1173,7 @@ size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_cont #endif JL_DLLEXPORT jl_value_t *jl_get_backtrace(void); void jl_critical_error(int sig, int si_code, bt_context_t *context, jl_task_t *ct); -JL_DLLEXPORT void jl_raise_debugger(void); +JL_DLLEXPORT void jl_raise_debugger(void) JL_NOTSAFEPOINT; int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT; void jl_print_native_codeloc(char *pre_str, uintptr_t ip) JL_NOTSAFEPOINT; diff --git a/src/julia_threads.h b/src/julia_threads.h index 847465b363a2e..a102c6904cd8c 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -4,6 +4,7 @@ #ifndef JL_THREADS_H #define JL_THREADS_H +#include "work-stealing-queue.h" #include "julia_atomics.h" #ifndef _OS_WINDOWS_ #include "pthread.h" @@ -171,16 +172,11 @@ typedef struct { arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; -// Cache of thread local change to global metadata during GC -// This is sync'd after marking. -typedef union _jl_gc_mark_data jl_gc_mark_data_t; - typedef struct { - void **pc; // Current stack address for the pc (up growing) - jl_gc_mark_data_t *data; // Current stack address for the data (up growing) - void **pc_start; // Cached value of `gc_cache->pc_stack` - void **pc_end; // Cached value of `gc_cache->pc_stack_end` -} jl_gc_mark_sp_t; + ws_queue_t chunk_queue; + ws_queue_t ptr_queue; + arraylist_t reclaim_set; +} jl_gc_markqueue_t; typedef struct { // thread local increment of `perm_scanned_bytes` @@ -198,9 +194,6 @@ typedef struct { // this makes sure that a single objects can only appear once in // the lists (the mark bit cannot be flipped to `0` without sweeping) void *big_obj[1024]; - void **pc_stack; - void **pc_stack_end; - jl_gc_mark_data_t *data_stack; } jl_gc_mark_cache_t; struct _jl_bt_element_t; @@ -266,9 +259,9 @@ typedef struct _jl_tls_states_t { #endif jl_thread_t system_id; arraylist_t finalizers; + jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; arraylist_t sweep_objs; - jl_gc_mark_sp_t gc_mark_sp; // Saved exception for previous *external* API call or NULL if cleared. // Access via jl_exception_occurred(). 
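The jl_gc_markqueue_t introduced above pairs a pointer queue with a chunk queue. A plausible drain loop follows (the helper names are assumptions; the declared entry point is gc_mark_loop_serial_ in src/gc.h). Note that popping an empty chunk queue naturally yields GC_empty_chunk, because ws_queue_pop zeroes its destination when the queue is empty and GC_empty_chunk == 0.

    // Sketch: drain pointers first; when the pointer queue runs dry, expand a
    // deferred chunk back into pointers; stop once both queues are empty.
    static void drain_markqueue(jl_ptls_t ptls, jl_gc_markqueue_t *mq)
    {
        while (1) {
            void *obj = gc_ptr_queue_pop(mq);   // assumed helper
            if (obj != NULL) {
                gc_mark_outrefs(ptls, mq, obj); // scan fields, push children
                continue;
            }
            jl_gc_chunk_t c = gc_chunkqueue_pop(mq); // assumed helper
            if (c.cid == GC_empty_chunk)
                break;
            gc_mark_chunk(ptls, mq, &c); // re-expands into the pointer queue
        }
    }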
struct _jl_value_t *previous_exception; diff --git a/src/options.h b/src/options.h index 82b71431ecea0..0e6603c59d682 100644 --- a/src/options.h +++ b/src/options.h @@ -134,6 +134,9 @@ // threadpools specification #define THREADPOOLS_NAME "JULIA_THREADPOOLS" +// GC threads +#define NUM_GC_THREADS_NAME "JULIA_NUM_GC_THREADS" + // affinitization behavior #define MACHINE_EXCLUSIVE_NAME "JULIA_EXCLUSIVE" #define DEFAULT_MACHINE_EXCLUSIVE 0 diff --git a/src/partr.c b/src/partr.c index bbcf5048cace2..793b48481850f 100644 --- a/src/partr.c +++ b/src/partr.c @@ -78,7 +78,7 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_mark_sp_t *sp, jl_value_t *obj) JL_NOTSAFEPOINT; + jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; // parallel task runtime // --- @@ -108,7 +108,37 @@ void jl_init_threadinginfra(void) void JL_NORETURN jl_finish_task(jl_task_t *t); -// thread function: used by all except the main thread +extern uv_mutex_t gc_threads_lock; +extern uv_cond_t gc_threads_cond; +extern _Atomic(int) gc_n_threads_marking; +extern void gc_mark_loop_parallel(jl_ptls_t ptls, int master); + +// gc thread function +void jl_gc_threadfun(void *arg) +{ + jl_threadarg_t *targ = (jl_threadarg_t*)arg; + + // initialize this thread (set tid and create heap) + jl_ptls_t ptls = jl_init_threadtls(targ->tid); + + // wait for all threads + jl_gc_state_set(ptls, JL_GC_STATE_WAITING, 0); + uv_barrier_wait(targ->barrier); + + // free the thread argument here + free(targ); + + while (1) { + uv_mutex_lock(&gc_threads_lock); + while (jl_atomic_load(&gc_n_threads_marking) == 0) { + uv_cond_wait(&gc_threads_cond, &gc_threads_lock); + } + uv_mutex_unlock(&gc_threads_lock); + gc_mark_loop_parallel(ptls, 0); + } +} + +// thread function: used by all mutator threads except the main thread void jl_threadfun(void *arg) { jl_threadarg_t *targ = (jl_threadarg_t*)arg; @@ -419,7 +449,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, uv_mutex_lock(&ptls->sleep_lock); while (may_sleep(ptls)) { uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock); - // TODO: help with gc work here, if applicable } assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping); uv_mutex_unlock(&ptls->sleep_lock); diff --git a/src/support/MurmurHash3.c b/src/support/MurmurHash3.c index 43bd015ddd69f..1f7056d0e7e56 100644 --- a/src/support/MurmurHash3.c +++ b/src/support/MurmurHash3.c @@ -12,8 +12,6 @@ //----------------------------------------------------------------------------- // Platform-specific functions and macros -#define FORCE_INLINE inline __attribute__((always_inline)) - static inline uint32_t rotl32 ( uint32_t x, int8_t r ) { return (x << r) | (x >> (32 - r)); diff --git a/src/support/dtypes.h b/src/support/dtypes.h index d49ae0b22b5f9..891c091413084 100644 --- a/src/support/dtypes.h +++ b/src/support/dtypes.h @@ -117,6 +117,7 @@ typedef intptr_t ssize_t; #define LLT_FREE(x) free(x) #define STATIC_INLINE static inline +#define FORCE_INLINE static inline __attribute__((always_inline)) #if defined(_OS_WINDOWS_) && !defined(_COMPILER_GCC_) # define NOINLINE __declspec(noinline) diff --git a/src/threading.c b/src/threading.c index 8500279220825..7ed8e3b6e7dc9 100644 --- a/src/threading.c +++ b/src/threading.c @@ -562,6 +562,8 @@ static void jl_check_tls(void) JL_DLLEXPORT const int jl_tls_elf_support = 0; #endif +extern int gc_first_tid; + // interface to Julia; sets 
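The sleep side of the GC-thread protocol is jl_gc_threadfun above; the wake side is not shown in this hunk. A sketch of what the marking master presumably does, using only the primitives declared here (gc_threads_lock, gc_threads_cond, gc_n_threads_marking): the counter is made nonzero before broadcasting, so a GC thread that wakes up finds the wait condition false and proceeds to gc_mark_loop_parallel.

    // Sketch of the wake side: publish work, then wake all GC threads.
    static void gc_wake_gc_threads(void)
    {
        uv_mutex_lock(&gc_threads_lock);
        jl_atomic_fetch_add(&gc_n_threads_marking, 1); // count the master in
        uv_cond_broadcast(&gc_threads_cond);
        uv_mutex_unlock(&gc_threads_lock);
    }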
up to make the runtime thread-safe void jl_init_threading(void) { @@ -614,13 +616,32 @@ void jl_init_threading(void) } } - jl_all_tls_states_size = nthreads + nthreadsi; + int16_t ngcthreads = jl_options.ngcthreads - 1; + if (ngcthreads == -1 && + (cp = getenv(NUM_GC_THREADS_NAME))) { // ENV[NUM_GC_THREADS_NAME] specified + + ngcthreads = (uint64_t)strtol(cp, NULL, 10) - 1; + } + if (ngcthreads == -1) { + // if `--gcthreads` was not specified, set the number of GC threads + // to half of the number of compute threads + if (nthreads <= 1) { + ngcthreads = 0; + } + else { + ngcthreads = (nthreads / 2) - 1; + } + } + + jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreadsi; jl_n_threads_per_pool[1] = nthreads; jl_atomic_store_release(&jl_all_tls_states, (jl_ptls_t*)calloc(jl_all_tls_states_size, sizeof(jl_ptls_t))); jl_atomic_store_release(&jl_n_threads, jl_all_tls_states_size); + jl_n_gcthreads = ngcthreads; + gc_first_tid = nthreads; } static uv_barrier_t thread_init_done; @@ -628,6 +649,7 @@ void jl_start_threads(void) { int nthreads = jl_atomic_load_relaxed(&jl_n_threads); + int ngcthreads = jl_n_gcthreads; int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; @@ -660,15 +682,23 @@ void jl_start_threads(void) // create threads uv_barrier_init(&thread_init_done, nthreads); + // GC/System threads need to be after the worker threads. + int nworker_threads = nthreads - ngcthreads; + for (i = 1; i < nthreads; ++i) { jl_threadarg_t *t = (jl_threadarg_t *)malloc_s(sizeof(jl_threadarg_t)); // ownership will be passed to the thread t->tid = i; t->barrier = &thread_init_done; - uv_thread_create(&uvtid, jl_threadfun, t); - if (exclusive) { - mask[i] = 1; - uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize); - mask[i] = 0; + if (i < nworker_threads) { + uv_thread_create(&uvtid, jl_threadfun, t); + if (exclusive) { + mask[i] = 1; + uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize); + mask[i] = 0; + } + } + else { + uv_thread_create(&uvtid, jl_gc_threadfun, t); } uv_thread_detach(&uvtid); } diff --git a/src/threading.h b/src/threading.h index 9fd63f0fd188d..1cf2d4a9d3711 100644 --- a/src/threading.h +++ b/src/threading.h @@ -25,6 +25,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid); // provided by a threading infrastructure void jl_init_threadinginfra(void); +void jl_gc_threadfun(void *arg); void jl_threadfun(void *arg); #ifdef __cplusplus diff --git a/src/work-stealing-queue.h b/src/work-stealing-queue.h new file mode 100644 index 0000000000000..38429e02886e9 --- /dev/null +++ b/src/work-stealing-queue.h @@ -0,0 +1,102 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#ifndef WORK_STEALING_QUEUE_H +#define WORK_STEALING_QUEUE_H + +#include "julia_atomics.h" +#include "assert.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// ======= +// Chase and Lev's work-stealing queue, optimized for +// weak memory models by Le et al. +// +// * Chase D., Lev Y. Dynamic Circular Work-Stealing queue +// * Le N. M. et al.
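Worked example for the bookkeeping above: `julia -t 8 --gcthreads=4` (no interactive pool) gives jl_options.ngcthreads = 4, hence an internal ngcthreads of 3, since the thread driving the collection presumably also participates in marking. Then jl_all_tls_states_size = 8 + 0 + 3 = 11 and gc_first_tid = 8; jl_start_threads spawns tids 1..7 running jl_threadfun and tids 8..10 running jl_gc_threadfun, and Threads.ngcthreads() reports jl_n_gcthreads + 1 == 4.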
Correct and Efficient Work-Stealing for +// Weak Memory Models +// ======= + +typedef struct { + char *buffer; + int32_t capacity; + int32_t mask; +} ws_array_t; + +static inline ws_array_t *create_ws_array(size_t capacity, int32_t eltsz) JL_NOTSAFEPOINT +{ + ws_array_t *a = (ws_array_t *)malloc_s(sizeof(ws_array_t)); + a->buffer = (char *)malloc_s(capacity * eltsz); + a->capacity = capacity; + a->mask = capacity - 1; + return a; +} + +typedef struct { + _Atomic(int64_t) top; + _Atomic(int64_t) bottom; + _Atomic(ws_array_t *) array; +} ws_queue_t; + +static inline ws_array_t *ws_queue_push(ws_queue_t *q, void *elt, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t b = jl_atomic_load_relaxed(&q->bottom); + int64_t t = jl_atomic_load_acquire(&q->top); + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + ws_array_t *old_ary = NULL; + if (__unlikely(b - t > ary->capacity - 1)) { + ws_array_t *new_ary = create_ws_array(2 * ary->capacity, eltsz); + for (int i = 0; i < ary->capacity; i++) { + memcpy(new_ary->buffer + ((t + i) & new_ary->mask) * eltsz, ary->buffer + ((t + i) & ary->mask) * eltsz, eltsz); + } + jl_atomic_store_release(&q->array, new_ary); + old_ary = ary; + ary = new_ary; + } + memcpy(ary->buffer + (b & ary->mask) * eltsz, elt, eltsz); + jl_fence_release(); + jl_atomic_store_relaxed(&q->bottom, b + 1); + return old_ary; +} + +static inline void ws_queue_pop(ws_queue_t *q, void *dest, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t b = jl_atomic_load_relaxed(&q->bottom) - 1; + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + jl_atomic_store_relaxed(&q->bottom, b); + jl_fence(); + int64_t t = jl_atomic_load_relaxed(&q->top); + if (__likely(t <= b)) { + memcpy(dest, ary->buffer + (b & ary->mask) * eltsz, eltsz); + if (t == b) { + if (!jl_atomic_cmpswap(&q->top, &t, t + 1)) + memset(dest, 0, eltsz); + jl_atomic_store_relaxed(&q->bottom, b + 1); + } + } + else { + memset(dest, 0, eltsz); + jl_atomic_store_relaxed(&q->bottom, b + 1); + } +} + +static inline void ws_queue_steal_from(ws_queue_t *q, void *dest, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t t = jl_atomic_load_acquire(&q->top); + jl_fence(); + int64_t b = jl_atomic_load_acquire(&q->bottom); + if (t < b) { + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + memcpy(dest, ary->buffer + (t & ary->mask) * eltsz, eltsz); + if (!jl_atomic_cmpswap(&q->top, &t, t + 1)) + memset(dest, 0, eltsz); + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/stdlib/Distributed/src/cluster.jl b/stdlib/Distributed/src/cluster.jl index 554a3d9185080..d8039ba160d66 100644 --- a/stdlib/Distributed/src/cluster.jl +++ b/stdlib/Distributed/src/cluster.jl @@ -1345,9 +1345,10 @@ function process_opts(opts) end # Propagate --threads to workers - threads = get_threads_spec(opts) + threads = opts.nthreads > 0 ? `--threads=$(opts.nthreads)` : `` + gcthreads = opts.ngcthreads > 0 ? 
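A single-threaded usage sketch for the queue above (illustrative: the GC stores jl_value_t * and jl_gc_chunk_t elements, not ints). Capacities must be powers of two for the index mask to work. Only the owning thread may push/pop at the bottom; any thread may steal from the top. A resize makes ws_queue_push hand back the old array, which must not be freed while thieves may still be reading it; that is what the reclaim_set in jl_gc_markqueue_t is for, drained by gc_mark_clean_reclaim_sets() after the mark barrier.

    #include "work-stealing-queue.h"

    typedef struct { int v; } item_t;

    static void ws_queue_example(void)
    {
        ws_queue_t q = {0};
        jl_atomic_store_relaxed(&q.array, create_ws_array(8, sizeof(item_t)));

        item_t in = {42}, out = {0};
        ws_array_t *old = ws_queue_push(&q, &in, sizeof(item_t));
        // `old` is non-NULL only if the push grew the buffer; defer freeing it
        // (the GC parks it in the reclaim set until marking has finished).
        (void)old;

        ws_queue_pop(&q, &out, sizeof(item_t));        // out.v == 42
        ws_queue_steal_from(&q, &out, sizeof(item_t)); // no-op: queue is empty now
    }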
`--gcthreads=$(opts.ngcthreads)` : `` - exeflags = `$threads` + exeflags = `$threads $gcthreads` # add processors if opts.nprocs > 0 diff --git a/test/choosetests.jl b/test/choosetests.jl index 334ef051a0fe6..7d426221180f5 100644 --- a/test/choosetests.jl +++ b/test/choosetests.jl @@ -21,7 +21,7 @@ const TESTNAMES = [ "combinatorics", "sysinfo", "env", "rounding", "ranges", "mod2pi", "euler", "show", "client", "errorshow", "sets", "goto", "llvmcall", "llvmcall2", "ryu", - "some", "meta", "stacktraces", "docs", + "some", "meta", "stacktraces", "docs", "gc", "misc", "threads", "stress", "binaryplatforms", "atexit", "enums", "cmdlineargs", "int", "interpreter", "checked", "bitset", "floatfuncs", "precompile", diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 7f86534f923b3..f73a7854fd2f1 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -269,6 +269,24 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no` @test p.exitcode == 1 && p.termsignal == 0 end + # --gcthreads + code = "print(Threads.ngcthreads())" + cpu_threads = ccall(:jl_effective_threads, Int32, ()) + @test (cpu_threads == 1 ? "1" : string(div(cpu_threads, 2))) == + read(`$exename --threads auto -e $code`, String) == + read(`$exename --threads=auto -e $code`, String) == + read(`$exename -tauto -e $code`, String) == + read(`$exename -t auto -e $code`, String) + for nt in (nothing, "1") + withenv("JULIA_NUM_GC_THREADS" => nt) do + @test read(`$exename --gcthreads=2 -e $code`, String) == "2" + end + end + + withenv("JULIA_NUM_GC_THREADS" => 2) do + @test read(`$exename -e $code`, String) == "2" + end + # --machine-file # this does not check that machine file works, # only that the filename gets correctly passed to the option struct diff --git a/test/gc.jl b/test/gc.jl new file mode 100644 index 0000000000000..9cc9d753dfc09 --- /dev/null +++ b/test/gc.jl @@ -0,0 +1,18 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Test + +function run_gctest(file) + let cmd = `$(Base.julia_cmd()) --depwarn=error --rr-detach --startup-file=no $file` + for test_nthreads in (1, 2, 4) + new_env = copy(ENV) + new_env["JULIA_NUM_THREADS"] = string(test_nthreads) + new_env["JULIA_NUM_GC_THREADS"] = string(test_nthreads) + @time run(pipeline(setenv(cmd, new_env), stdout = stdout, stderr = stderr)) + end + end +end + +@time run_gctest("gc/binarytree.jl") +@time run_gctest("gc/linkedlist.jl") +@time run_gctest("gc/objarray.jl") diff --git a/test/gc/binarytree.jl b/test/gc/binarytree.jl new file mode 100644 index 0000000000000..3089e2d2ce869 --- /dev/null +++ b/test/gc/binarytree.jl @@ -0,0 +1,53 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +module BinaryTreeMutable + +# Adopted from +# https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/binarytrees.html#binarytrees + +using Base.Threads +using Printf + +mutable struct Node + l::Union{Nothing, Node} + r::Union{Nothing, Node} +end + +function make(n::Int) + return n === 0 ? Node(nothing, nothing) : Node(make(n-1), make(n-1)) +end + +function check(node::Node) + return 1 + (node.l === nothing ? 
0 : check(node.l) + check(node.r)) +end + +function binary_trees(io, n::Int) + @printf io "stretch tree of depth %jd\t check: %jd\n" n+1 check(make(n+1)) + + long_tree = make(n) + minDepth = 4 + resultSize = div((n - minDepth), 2) + 1 + results = Vector{String}(undef, resultSize) + Threads.@threads for depth in minDepth:2:n + c = 0 + niter = 1 << (n - depth + minDepth) + for _ in 1:niter + c += check(make(depth)) + end + index = div((depth - minDepth),2) + 1 + results[index] = @sprintf "%jd\t trees of depth %jd\t check: %jd\n" niter depth c + end + + for i in results + write(io, i) + end + + @printf io "long lived tree of depth %jd\t check: %jd\n" n check(long_tree) +end + +end #module + +using .BinaryTreeMutable + +BinaryTreeMutable.binary_trees(devnull, 20) +GC.gc() diff --git a/test/gc/linkedlist.jl b/test/gc/linkedlist.jl new file mode 100644 index 0000000000000..c447a9680326d --- /dev/null +++ b/test/gc/linkedlist.jl @@ -0,0 +1,21 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +mutable struct ListNode + key::Int64 + next::ListNode + ListNode() = new() + ListNode(x)= new(x) + ListNode(x,y) = new(x,y); +end + +function list(n=128) + start::ListNode = ListNode(1) + current::ListNode = start + for i = 2:(n*1024^2) + current = ListNode(i,current) + end + return current.key +end + +_ = list() +GC.gc() diff --git a/test/gc/objarray.jl b/test/gc/objarray.jl new file mode 100644 index 0000000000000..4b4cb67c42eac --- /dev/null +++ b/test/gc/objarray.jl @@ -0,0 +1,35 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Random: seed! +seed!(1) + +abstract type Cell end + +struct CellA<:Cell + a::Ref{Int} +end + +struct CellB<:Cell + b::String +end + +function fillcells!(mc::Array{Cell}) + for ind in eachindex(mc) + mc[ind] = ifelse(rand() > 0.5, CellA(ind), CellB(string(ind))) + end + return mc +end + +function work(size) + mcells = Array{Cell}(undef, size, size) + fillcells!(mcells) +end + +function run(maxsize) + Threads.@threads for i in 1:maxsize + work(i*500) + end +end + +run(4) +GC.gc()