diff --git a/NEWS.md b/NEWS.md index 695867f2e4f63..18a5b0e620e7d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,6 +38,7 @@ Compiler/Runtime improvements * All uses of the `@pure` macro in `Base` have been replaced with the now-preferred `Base.@assume_effects` ([#44776]). * `invoke(f, invokesig, args...)` calls to a less-specific method than would normally be chosen for `f(args...)` are no longer spuriously invalidated when loading package precompile files ([#46010]). +* The mark phase of the Garbage Collector is now multi-threaded ([#48600]). Command-line option changes --------------------------- @@ -49,6 +50,8 @@ Command-line option changes number of interactive threads to create (`auto` currently means 1) ([#42302]). * New option `--heap-size-hint=<size>` suggests a size limit to invoke garbage collection more eagerly. The size may be specified in bytes, kilobytes (1000k), megabytes (300M), or gigabytes (1.5G) ([#45369]). +* New option `--gcthreads` to set how many threads will be used by the Garbage Collector ([#48600]). + The default is set to `N/2` where `N` is the number of worker threads (`--threads`) used by Julia. Multi-threading changes ----------------------- diff --git a/base/options.jl b/base/options.jl index b9fa5b6ec5508..fb043672dc19a 100644 --- a/base/options.jl +++ b/base/options.jl @@ -11,6 +11,7 @@ struct JLOptions cpu_target::Ptr{UInt8} nthreadpools::Int16 nthreads::Int16 + ngcthreads::Int16 nthreads_per_pool::Ptr{Int16} nprocs::Int32 machine_file::Ptr{UInt8} diff --git a/base/threadingconstructs.jl b/base/threadingconstructs.jl index 0c6563c54f99f..ce216f290ae12 100644 --- a/base/threadingconstructs.jl +++ b/base/threadingconstructs.jl @@ -136,6 +136,13 @@ function threadpooltids(pool::Symbol) end end +""" + Threads.ngcthreads() -> Int + +Returns the number of GC threads currently configured. +""" +ngcthreads() = Int(unsafe_load(cglobal(:jl_n_gcthreads, Cint))) + 1 + function threading_run(fun, static) ccall(:jl_enter_threaded_region, Cvoid, ()) n = threadpoolsize() diff --git a/doc/man/julia.1 b/doc/man/julia.1 index 383c588c58dae..fa9f641b1e76f 100644 --- a/doc/man/julia.1 +++ b/doc/man/julia.1 @@ -118,6 +118,11 @@ supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads. +.TP +--gcthreads +Enable n GC threads; if unspecified, defaults to half of the +number of compute worker threads. + .TP -p, --procs {N|auto} Integer value N launches N additional local worker processes `auto` launches as many workers diff --git a/doc/src/base/multi-threading.md b/doc/src/base/multi-threading.md index 4932aef4cc938..fb75b21479707 100644 --- a/doc/src/base/multi-threading.md +++ b/doc/src/base/multi-threading.md @@ -10,6 +10,7 @@ Base.Threads.nthreads Base.Threads.threadpool Base.Threads.nthreadpools Base.Threads.threadpoolsize +Base.Threads.ngcthreads ``` See also [Multi-Threading](@ref man-multithreading). 
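A brief orientation note before the remaining documentation hunks: `--gcthreads` sets the count of dedicated GC threads, NEWS.md above documents the default as `N/2` of the `--threads` worker count, and the new `Threads.ngcthreads()` returns `jl_n_gcthreads + 1`. The C sketch below only restates that arithmetic under stated assumptions: `default_gcthreads` is a hypothetical helper (the diff does not show where the default is actually computed), and the `+ 1` is read here as the mutator thread that triggered collection also taking part in marking, per the `gc_master_tid` comment later in this patch.

```c
/* Hedged sketch, not part of the patch: how the documented N/2 default and
 * the Threads.ngcthreads() accounting fit together. `default_gcthreads` is
 * a hypothetical name; only `jl_n_gcthreads` and the N/2 default are taken
 * from the diff. */
#include <stdint.h>

extern int jl_n_gcthreads; /* dedicated GC threads (global added by this patch) */

/* Default when --gcthreads is not given: half the compute worker threads. */
static int16_t default_gcthreads(int16_t nthreads)
{
    int16_t n = nthreads / 2;
    return n < 0 ? 0 : n; /* assumption: clamp so the count is never negative */
}

/* Mirrors `ngcthreads() = Int(unsafe_load(cglobal(:jl_n_gcthreads, Cint))) + 1`
 * from base/threadingconstructs.jl: the thread that triggers GC is counted
 * as a marking participant alongside the dedicated GC threads. */
static int ngcthreads(void)
{
    return jl_n_gcthreads + 1;
}
```

Under this reading, `julia --threads 8 --gcthreads 4` would report `Threads.ngcthreads() == 5`, and `julia --threads 8` alone would default to 4 GC threads.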
diff --git a/doc/src/manual/command-line-interface.md b/doc/src/manual/command-line-interface.md index 54c56a354c7a3..781a77a33dadb 100644 --- a/doc/src/manual/command-line-interface.md +++ b/doc/src/manual/command-line-interface.md @@ -106,8 +106,9 @@ The following is a complete list of command-line switches available when launchi |`-e`, `--eval <expr>` |Evaluate `<expr>`| |`-E`, `--print <expr>` |Evaluate `<expr>` and display the result| |`-L`, `--load <file>` |Load `<file>` immediately on all processors| -|`-t`, `--threads {N\|auto`} |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| -|`-p`, `--procs {N\|auto`} |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| +|`-t`, `--threads {N\|auto}` |Enable N threads; `auto` tries to infer a useful default number of threads to use but the exact behavior might change in the future. Currently, `auto` uses the number of CPUs assigned to this julia process based on the OS-specific affinity assignment interface, if supported (Linux and Windows). If this is not supported (macOS) or process affinity is not configured, it uses the number of CPU threads.| +|`--gcthreads {N}` |Enable N GC threads; if unspecified, defaults to half of the number of compute worker threads.| +|`-p`, `--procs {N\|auto}` |Integer value N launches N additional local worker processes; `auto` launches as many workers as the number of local CPU threads (logical cores)| |`--machine-file <file>` |Run processes on hosts listed in `<file>`| |`-i` |Interactive mode; REPL runs and `isinteractive()` is true| |`-q`, `--quiet` |Quiet startup: no banner, suppress REPL warnings| diff --git a/doc/src/manual/environment-variables.md b/doc/src/manual/environment-variables.md index f29e5b7aaf8f7..68fb79fc0a9a6 100644 --- a/doc/src/manual/environment-variables.md +++ b/doc/src/manual/environment-variables.md @@ -315,6 +315,14 @@ then spinning threads never sleep. Otherwise, `$JULIA_THREAD_SLEEP_THRESHOLD` is interpreted as an unsigned 64-bit integer (`uint64_t`) and gives, in nanoseconds, the amount of time after which spinning threads should sleep. +### [`JULIA_NUM_GC_THREADS`](@id env-gc-threads) + +Sets the number of threads used by the garbage collector. If unspecified, it defaults to +half of the number of worker threads. + +!!! compat "Julia 1.10" + This environment variable was added in Julia 1.10. + ### `JULIA_EXCLUSIVE` If set to anything besides `0`, then Julia's thread policy is consistent with diff --git a/doc/src/manual/multi-threading.md b/doc/src/manual/multi-threading.md index f8c6b040941f4..e1cb17bd5b93f 100644 --- a/doc/src/manual/multi-threading.md +++ b/doc/src/manual/multi-threading.md @@ -72,6 +72,15 @@ julia> Threads.threadid() three processes have 2 threads enabled. For more fine grained control over worker threads use [`addprocs`](@ref) and pass `-t`/`--threads` as `exeflags`. +### Multiple GC Threads + +The Garbage Collector (GC) can use multiple threads. The number used defaults to half the number +of compute worker threads and can be configured with the `--gcthreads` command-line argument or the +[`JULIA_NUM_GC_THREADS`](@ref env-gc-threads) environment variable. + +!!! 
compat "Julia 1.10" + The `--gcthreads` command line argument requires at least Julia 1.10. + ## [Threadpools](@id man-threadpools) When a program's threads are busy with many tasks to run, tasks may experience diff --git a/src/Makefile b/src/Makefile index 6f10bf505e2ef..51d8bf1f2de65 100644 --- a/src/Makefile +++ b/src/Makefile @@ -99,7 +99,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) endif diff --git a/src/gc-debug.c b/src/gc-debug.c index 3f60ca17e0dc4..0f4120920ce76 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -198,21 +198,32 @@ static void restore(void) static void gc_verify_track(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + // `gc_verify_track` is limited to single-threaded GC + if (jl_n_gcthreads != 0) + return; do { - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t mq; + jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + ws_queue_t *cq = &mq.chunk_queue; + ws_queue_t *q = &mq.ptr_queue; + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, jl_atomic_load_relaxed(&mq2->chunk_queue.array)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, jl_atomic_load_relaxed(&mq2->ptr_queue.array)); + arraylist_new(&mq.reclaim_set, 32); arraylist_push(&lostval_parents_done, lostval); jl_safe_printf("Now looking for %p =======\n", lostval); clear_mark(GC_CLEAN); - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_serial_(ptls, &mq); if (lostval_parents.len == 0) { jl_safe_printf("Could not find the missing link. We missed a toplevel root. This is odd.\n"); break; @@ -246,22 +257,35 @@ static void gc_verify_track(jl_ptls_t ptls) void gc_verify(jl_ptls_t ptls) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + // `gc_verify` is limited to single-threaded GC + if (jl_n_gcthreads != 0) { + jl_safe_printf("Warn. 
GC verify disabled in multi-threaded GC\n"); + return; + } + jl_gc_markqueue_t mq; + jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + ws_queue_t *cq = &mq.chunk_queue; + ws_queue_t *q = &mq.ptr_queue; + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, jl_atomic_load_relaxed(&mq2->chunk_queue.array)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, jl_atomic_load_relaxed(&mq2->ptr_queue.array)); + arraylist_new(&mq.reclaim_set, 32); lostval = NULL; lostval_parents.len = 0; lostval_parents_done.len = 0; clear_mark(GC_CLEAN); gc_verifying = 1; - gc_mark_queue_all_roots(ptls, &sp); - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0; i < gc_n_threads; i++) { + gc_mark_queue_all_roots(ptls, &mq); + gc_mark_finlist(&mq, &to_finalize, 0); + for (int i = 0; i < gc_n_threads;i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + gc_mark_finlist(&mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(&mq, &finalizer_list_marked, 0); + gc_mark_loop_serial_(ptls, &mq); int clean_len = bits_save[GC_CLEAN].len; for(int i = 0; i < clean_len + bits_save[GC_OLD].len; i++) { jl_taggedvalue_t *v = (jl_taggedvalue_t*)bits_save[i >= clean_len ? GC_OLD : GC_CLEAN].items[i >= clean_len ? i - clean_len : i]; @@ -500,7 +524,7 @@ int jl_gc_debug_check_other(void) return gc_debug_alloc_check(&jl_gc_debug_env.other); } -void jl_gc_debug_print_status(void) +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT { uint64_t pool_count = jl_gc_debug_env.pool.num; uint64_t other_count = jl_gc_debug_env.other.num; @@ -509,7 +533,7 @@ void jl_gc_debug_print_status(void) pool_count + other_count, pool_count, other_count, gc_num.pause); } -void jl_gc_debug_critical_error(void) +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT { jl_gc_debug_print_status(); if (!jl_gc_debug_env.wait_for_debugger) @@ -1264,139 +1288,6 @@ int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT return (slot - start) / elsize; } -// Print a backtrace from the bottom (start) of the mark stack up to `sp` -// `pc_offset` will be added to `sp` for convenience in the debugger. -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset) -{ - jl_jmp_buf *old_buf = jl_get_safe_restore(); - jl_jmp_buf buf; - jl_set_safe_restore(&buf); - if (jl_setjmp(buf, 0) != 0) { - jl_safe_printf("\n!!! ERROR when unwinding gc mark loop -- ABORTING !!!\n"); - jl_set_safe_restore(old_buf); - return; - } - void **top = sp.pc + pc_offset; - jl_gc_mark_data_t *data_top = sp.data; - sp.data = ptls->gc_cache.data_stack; - sp.pc = ptls->gc_cache.pc_stack; - int isroot = 1; - while (sp.pc < top) { - void *pc = *sp.pc; - const char *prefix = isroot ? 
"r--" : " `-"; - isroot = 0; - if (pc == gc_mark_label_addrs[GC_MARK_L_marked_obj]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Root object: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_scan_only]) { - gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Queued root: %p :: %p (bits: %d)\n of type ", - (void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits); - jl_((void*)data->tag); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_finlist]) { - gc_mark_finlist_t *data = gc_repush_markdata(&sp, gc_mark_finlist_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: Finalizer list from %p to %p\n", - (void*)data, (void*)data->begin, (void*)data->end); - isroot = 1; - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_objarray]) { - gc_mark_objarray_t *data = gc_repush_markdata(&sp, gc_mark_objarray_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Array in object %p :: %p -- [%p, %p)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (void*)data->begin, (void*)data->end); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj8]) { - gc_mark_obj8_t *data = gc_repush_markdata(&sp, gc_mark_obj8_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint8_t *desc = (uint8_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (8bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj16]) { - gc_mark_obj16_t *data = gc_repush_markdata(&sp, gc_mark_obj16_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint16_t *desc = (uint16_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (16bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), (int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_obj32]) { - gc_mark_obj32_t *data = gc_repush_markdata(&sp, gc_mark_obj32_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent); - uint32_t *desc = (uint32_t*)jl_dt_layout_ptrs(vt->layout); - jl_safe_printf("%p: %s Object (32bit) %p :: %p -- [%d, %d)\n of type ", - (void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1], - (int)(data->begin - desc), 
(int)(data->end - desc)); - jl_(jl_typeof(data->parent)); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_stack]) { - gc_mark_stackframe_t *data = gc_repush_markdata(&sp, gc_mark_stackframe_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Stack frame %p -- %d of %d (%s)\n", - (void*)data, prefix, (void*)data->s, (int)data->i, - (int)data->nroots >> 1, - (data->nroots & 1) ? "indirect" : "direct"); - } - else if (pc == gc_mark_label_addrs[GC_MARK_L_module_binding]) { - // module_binding - gc_mark_binding_t *data = gc_repush_markdata(&sp, gc_mark_binding_t); - if ((jl_gc_mark_data_t *)data > data_top) { - jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n"); - break; - } - jl_safe_printf("%p: %s Module (bindings) %p (bits %d) -- [%p, %p)\n", - (void*)data, prefix, (void*)data->parent, (int)data->bits, - (void*)data->begin, (void*)data->end); - } - else { - jl_safe_printf("Unknown pc %p --- ABORTING !!!\n", pc); - break; - } - } - jl_set_safe_restore(old_buf); -} - static int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc.c b/src/gc.c index 341fae2b02553..027c551402b3f 100644 --- a/src/gc.c +++ b/src/gc.c @@ -11,6 +11,18 @@ extern "C" { #endif +// `tid` of mutator thread that triggered GC +_Atomic(int) gc_master_tid; +// `tid` of first GC thread +int gc_first_tid; + +// Mutex/cond used to synchronize sleep/wakeup of GC threads +uv_mutex_t gc_threads_lock; +uv_cond_t gc_threads_cond; + +// Number of threads currently running the GC mark-loop +_Atomic(int) gc_n_threads_marking; + // Linked list of callback functions typedef void (*jl_gc_cb_func_t)(void); @@ -112,17 +124,6 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); } -// Save/restore local mark stack to/from thread-local storage. - -STATIC_INLINE void export_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - ptls->gc_mark_sp = *sp; -} - -STATIC_INLINE void import_gc_state(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { - // Has the stack been reallocated in the meantime? - *sp = ptls->gc_mark_sp; -} - // Protect all access to `finalizer_list_marked` and `to_finalize`. // For accessing `ptls->finalizers`, the lock is needed if a thread // is going to realloc the buffer (of its own list) or accessing the @@ -327,16 +328,16 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) jl_wake_libuv(); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - // This acquire load pairs with the release stores - // in the signal handler of safepoint so we are sure that - // all the stores on those threads are visible. - // We're currently also using atomic store release in mutator threads - // (in jl_gc_state_set), but we may want to use signals to flush the - // memory operations on those threads lazily instead. - while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) - jl_cpu_pause(); // yield? + if (ptls2 != NULL) { + // This acquire load pairs with the release stores + // in the signal handler of safepoint so we are sure that + // all the stores on those threads are visible. 
+ // We're currently also using atomic store release in mutator threads + // (in jl_gc_state_set), but we may want to use signals to flush the + // memory operations on those threads lazily instead. + while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) + jl_cpu_pause(); // yield? + } } } @@ -644,7 +645,7 @@ void jl_gc_run_all_finalizers(jl_task_t *ct) schedule_all_finalizers(&finalizer_list_marked); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) schedule_all_finalizers(&ptls2->finalizers); } // unlock here because `run_finalizers` locks this @@ -720,7 +721,7 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); } finalize_object(&finalizer_list_marked, o, &copied_list, 0); @@ -760,7 +761,7 @@ static void gc_sweep_foreign_objs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) + if (ptls2 != NULL) gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); } } @@ -894,7 +895,7 @@ static void gc_sync_all_caches_nolock(jl_ptls_t ptls) assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2) + if (ptls2 != NULL) gc_sync_cache_nolock(ptls, &ptls2->gc_cache); } } @@ -913,21 +914,13 @@ STATIC_INLINE void gc_queue_big_marked(jl_ptls_t ptls, bigval_t *hdr, ptls->gc_cache.nbig_obj = nobj + 1; } -// `gc_setmark_tag` can be called concurrently on multiple threads. -// In all cases, the function atomically sets the mark bits and returns -// the GC bits set as well as if the tag was unchanged by this thread. -// All concurrent calls on the same object are guaranteed to be setting the -// bits to the same value. -// For normal objects, this is the bits with only `GC_MARKED` changed to `1` -// For buffers, this is the bits of the owner object. -// For `mark_reset_age`, this is `GC_MARKED` with `GC_OLD` cleared. -// The return value is `1` if the object was not marked before. -// Returning `0` can happen if another thread marked it in parallel. 
-STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, - uintptr_t tag, uint8_t *bits) JL_NOTSAFEPOINT -{ - assert(!gc_marked(tag)); +// Atomically set the mark bit for object and return whether it was previously unmarked +FORCE_INLINE int gc_try_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT +{ assert(gc_marked(mark_mode)); + uintptr_t tag = o->header; + if (gc_marked(tag)) + return 0; if (mark_reset_age) { // Reset the object as if it was just allocated mark_mode = GC_MARKED; @@ -939,7 +932,6 @@ STATIC_INLINE int gc_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode, tag = tag | mark_mode; assert((tag & 0x3) == mark_mode); } - *bits = mark_mode; tag = jl_atomic_exchange_relaxed((_Atomic(uintptr_t)*)&o->header, tag); verify_val(jl_valueof(o)); return !gc_marked(tag); @@ -1022,15 +1014,12 @@ STATIC_INLINE void gc_setmark(jl_ptls_t ptls, jl_taggedvalue_t *o, STATIC_INLINE void gc_setmark_buf_(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL_NOTSAFEPOINT { jl_taggedvalue_t *buf = jl_astaggedvalue(o); - uintptr_t tag = buf->header; - if (gc_marked(tag)) - return; - uint8_t bits; + uint8_t bits = (gc_old(buf->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED; // If the object is larger than the max pool size it can't be a pool object. // This should be accurate most of the time but there might be corner cases // where the size estimate is a little off so we do a pool lookup to make // sure. - if (__likely(gc_setmark_tag(buf, mark_mode, tag, &bits)) && !gc_verifying) { + if (__likely(gc_try_setmark_tag(buf, mark_mode)) && !gc_verifying) { if (minsz <= GC_MAX_SZCLASS) { jl_gc_pagemeta_t *page = page_metadata(buf); if (page) { @@ -1078,7 +1067,7 @@ void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT jl_gc_queue_root(v); } -static inline void maybe_collect(jl_ptls_t ptls) +STATIC_INLINE void maybe_collect(jl_ptls_t ptls) { if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); @@ -1105,14 +1094,14 @@ static void clear_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n, l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - for (n = 0; n < l; n++) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) - wr->value = (jl_value_t*)jl_nothing; + if (ptls2 != NULL) { + size_t n, l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + for (n = 0; n < l; n++) { + jl_weakref_t *wr = (jl_weakref_t*)lst[n]; + if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) + wr->value = (jl_value_t*)jl_nothing; + } } } } @@ -1122,27 +1111,27 @@ static void sweep_weak_refs(void) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - size_t n = 0; - size_t ndel = 0; - size_t l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; - if (l == 0) - continue; - while (1) { - jl_weakref_t *wr = (jl_weakref_t*)lst[n]; - if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) - n++; - else - ndel++; - if (n >= l - ndel) - break; - void *tmp = lst[n]; - lst[n] = lst[n + ndel]; - lst[n + ndel] = tmp; + if (ptls2 != NULL) { + size_t n = 0; + size_t ndel = 0; + size_t l = ptls2->heap.weak_refs.len; + void **lst = ptls2->heap.weak_refs.items; + if (l == 0) + continue; + while (1) { + jl_weakref_t *wr = 
(jl_weakref_t*)lst[n]; + if (gc_marked(jl_astaggedvalue(wr)->bits.gc)) + n++; + else + ndel++; + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] = tmp; + } + ptls2->heap.weak_refs.len -= ndel; } - ptls2->heap.weak_refs.len -= ndel; } } @@ -1150,7 +1139,7 @@ static void sweep_weak_refs(void) // big value list // Size includes the tag and the tag is not cleared!! -static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) +STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) { maybe_collect(ptls); size_t offs = offsetof(bigval_t, header); @@ -1182,7 +1171,6 @@ static inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) { jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); - maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag); return val; } @@ -1250,9 +1238,8 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_big_list(sweep_full, &ptls2->heap.big_objects); + if (ptls2 != NULL) + sweep_big_list(sweep_full, &ptls2->heap.big_objects); } if (sweep_full) { bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); @@ -1321,7 +1308,7 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; - if (ptls) { + if (ptls != NULL) { memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); } @@ -1370,32 +1357,32 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - mallocarray_t *ma = ptls2->heap.mallocarrays; - mallocarray_t **pma = &ptls2->heap.mallocarrays; - while (ma != NULL) { - mallocarray_t *nxt = ma->next; - int bits = jl_astaggedvalue(ma->a)->bits.gc; - if (gc_marked(bits)) { - pma = &ma->next; - } - else { - *pma = nxt; - assert(ma->a->flags.how == 2); - jl_gc_free_array(ma->a); - ma->next = ptls2->heap.mafreelist; - ptls2->heap.mafreelist = ma; + if (ptls2 != NULL) { + mallocarray_t *ma = ptls2->heap.mallocarrays; + mallocarray_t **pma = &ptls2->heap.mallocarrays; + while (ma != NULL) { + mallocarray_t *nxt = ma->next; + int bits = jl_astaggedvalue(ma->a)->bits.gc; + if (gc_marked(bits)) { + pma = &ma->next; + } + else { + *pma = nxt; + assert(ma->a->flags.how == 2); + jl_gc_free_array(ma->a); + ma->next = ptls2->heap.mafreelist; + ptls2->heap.mafreelist = ma; + } + gc_time_count_mallocd_array(bits); + ma = nxt; } - gc_time_count_mallocd_array(bits); - ma = nxt; } } gc_time_mallocd_array_end(); } // pool allocation -static inline jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT +STATIC_INLINE jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT { assert(GC_PAGE_OFFSET >= sizeof(void*)); pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; @@ -1444,7 +1431,7 @@ static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT } // Size includes the tag and the tag is not cleared!! 
-static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, +STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize) { // Use the pool offset instead of the pool address as the argument @@ -1462,7 +1449,7 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset jl_atomic_load_relaxed(&ptls->gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; - if (v) { + if (v != NULL) { jl_taggedvalue_t *next = v->next; p->freelist = next; if (__unlikely(gc_page_data(v) != gc_page_data(next))) { @@ -1482,8 +1469,8 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset // If there's no pages left or the current page is used up, // we need to use the slow path. char *cur_page = gc_page_data((char*)v - 1); - if (__unlikely(!v || cur_page + GC_PAGE_SZ < (char*)next)) { - if (v) { + if (__unlikely(v == NULL || cur_page + GC_PAGE_SZ < (char*)next)) { + if (v != NULL) { // like the freelist case, // but only update the page metadata when it is full jl_gc_pagemeta_t *pg = jl_assume(page_metadata((char*)v - 1)); @@ -1493,7 +1480,7 @@ static inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset v = *(jl_taggedvalue_t**)cur_page; } // Not an else!! - if (!v) + if (v == NULL) v = add_page(p); next = (jl_taggedvalue_t*)((char*)v + osize); } @@ -1507,7 +1494,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, int osize) { jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); - maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag); return val; } @@ -1660,7 +1646,7 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t } // the actual sweeping over all allocated pages in a memory pool -static inline void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT { int p_n = pg->pool_n; int t_n = pg->thread_n; @@ -1671,7 +1657,7 @@ static inline void sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg } // sweep over a pagetable0 for all allocated pages -static inline int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *pagetable0, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *pagetable0, int sweep_full) JL_NOTSAFEPOINT { unsigned ub = 0; unsigned alloc = 0; @@ -1695,7 +1681,7 @@ static inline int sweep_pool_pagetable0(jl_taggedvalue_t ***pfl, pagetable0_t *p } // sweep over pagetable1 for all pagetable0 that may contain allocated pages -static inline int sweep_pool_pagetable1(jl_taggedvalue_t ***pfl, pagetable1_t *pagetable1, int sweep_full) JL_NOTSAFEPOINT +STATIC_INLINE int sweep_pool_pagetable1(jl_taggedvalue_t ***pfl, pagetable1_t *pagetable1, int sweep_full) JL_NOTSAFEPOINT { unsigned ub = 0; unsigned alloc = 0; @@ -1724,7 +1710,7 @@ static void sweep_pool_pagetable(jl_taggedvalue_t ***pfl, int sweep_full) JL_NOT { if (REGION2_PG_COUNT == 1) { // compile-time optimization pagetable1_t *pagetable1 = memory_map.meta1[0]; - if (pagetable1) + if (pagetable1 != NULL) sweep_pool_pagetable1(pfl, pagetable1, sweep_full); return; } @@ -1823,10 +1809,10 @@ static void gc_sweep_pool(int sweep_full) // null out terminal pointers of free lists for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 
== NULL) - continue; - for (int i = 0; i < JL_GC_N_POOLS; i++) { - *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + if (ptls2 != NULL) { + for (int i = 0; i < JL_GC_N_POOLS; i++) { + *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } } } @@ -1910,7 +1896,7 @@ static void *volatile gc_findval; // for usage from gdb, for finding the gc-root // Handle the case where the stack is only partially copied. STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t addr = (uintptr_t)_addr; if (addr >= lb && addr < ub) @@ -1919,929 +1905,650 @@ STATIC_INLINE uintptr_t gc_get_stack_addr(void *_addr, uintptr_t offset, } STATIC_INLINE uintptr_t gc_read_stack(void *_addr, uintptr_t offset, - uintptr_t lb, uintptr_t ub) + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { uintptr_t real_addr = gc_get_stack_addr(_addr, offset, lb, ub); return *(uintptr_t*)real_addr; } JL_NORETURN NOINLINE void gc_assert_datatype_fail(jl_ptls_t ptls, jl_datatype_t *vt, - jl_gc_mark_sp_t sp) + jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { jl_safe_printf("GC error (probable corruption) :\n"); jl_gc_debug_print_status(); jl_(vt); jl_gc_debug_critical_error(); - gc_mark_loop_unwind(ptls, sp, 0); abort(); } -// This stores the label address in the mark loop function. -// We can't directly store that to a global array so we need some hack to get that. -// See the call to `gc_mark_loop` in init with a `NULL` `ptls`. -void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - -// Double the local mark stack (both pc and data) -static void NOINLINE gc_mark_stack_resize(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) JL_NOTSAFEPOINT -{ - jl_gc_mark_data_t *old_data = gc_cache->data_stack; - void **pc_stack = sp->pc_start; - size_t stack_size = (char*)sp->pc_end - (char*)pc_stack; - ptrdiff_t datadiff = (char*)sp->data - (char*)old_data; - gc_cache->data_stack = (jl_gc_mark_data_t *)realloc_s(old_data, stack_size * 2 * sizeof(jl_gc_mark_data_t)); - sp->data = (jl_gc_mark_data_t *)((char*)gc_cache->data_stack + datadiff); - - sp->pc_start = gc_cache->pc_stack = (void**)realloc_s(pc_stack, stack_size * 2 * sizeof(void*)); - gc_cache->pc_stack_end = sp->pc_end = sp->pc_start + stack_size * 2; - sp->pc = sp->pc_start + (sp->pc - pc_stack); -} - -// Push a work item to the stack. The type of the work item is marked with `pc`. -// The data needed is in `data` and is of size `data_size`. -// If there isn't enough space on the stack, the stack will be resized with the stack -// lock held. The caller should invalidate any local cache of the stack addresses that's not -// in `gc_cache` or `sp` -// The `sp` will be updated on return if `inc` is true. -STATIC_INLINE void gc_mark_stack_push(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - void *pc, void *data, size_t data_size, int inc) JL_NOTSAFEPOINT +// Check if `nptr` is tagged for `old + refyoung`, +// Push the object to the remset and update the `nptr` counter if necessary. 
+STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, + uintptr_t nptr) JL_NOTSAFEPOINT { - assert(data_size <= sizeof(jl_gc_mark_data_t)); - if (__unlikely(sp->pc == sp->pc_end)) - gc_mark_stack_resize(gc_cache, sp); - *sp->pc = pc; - memcpy(sp->data, data, data_size); - if (inc) { - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + data_size); - sp->pc++; + if (__unlikely((nptr & 0x3) == 0x3)) { + ptls->heap.remset_nptr += nptr >> 2; + arraylist_t *remset = ptls->heap.remset; + size_t len = remset->len; + if (__unlikely(len >= remset->max)) { + arraylist_push(remset, obj); + } + else { + remset->len = len + 1; + remset->items[len] = obj; + } } } -// Check if the reference is non-NULL and atomically set the mark bit. -// Update `*nptr`, which is the `nptr` field of the parent item, if the object is young. -// Return the tag (with GC bits cleared) and the GC bits in `*ptag` and `*pbits`. -// Return whether the object needs to be scanned / have metadata updated. -STATIC_INLINE int gc_try_setmark(jl_value_t *obj, uintptr_t *nptr, - uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT +// Push a work item to the queue +STATIC_INLINE void gc_ptr_queue_push(jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT { - if (!obj) - return 0; - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - if (!gc_marked(tag)) { - uint8_t bits; - int res = gc_setmark_tag(o, GC_MARKED, tag, &bits); - if (!gc_old(bits)) - *nptr = *nptr | 1; - *ptag = tag & ~(uintptr_t)0xf; - *pbits = bits; - return __likely(res); - } - else if (!gc_old(tag)) { - *nptr = *nptr | 1; - } - return 0; + ws_array_t *old_a = ws_queue_push(&mq->ptr_queue, &obj, sizeof(jl_value_t*)); + // Put `old_a` in `reclaim_set` to be freed after the mark phase + if (__unlikely(old_a != NULL)) + arraylist_push(&mq->reclaim_set, old_a); } -// Queue a finalizer list to be scanned in the mark loop. Start marking from index `start`. -void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start) +// Pop from the mark queue +STATIC_INLINE jl_value_t *gc_ptr_queue_pop(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - size_t len = list->len; - if (len <= start) - return; - jl_value_t **items = (jl_value_t**)list->items; - gc_mark_finlist_t markdata = {items + start, items + len}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_finlist], - &markdata, sizeof(markdata), 1); + jl_value_t *v = NULL; + ws_queue_pop(&mq->ptr_queue, &v, sizeof(jl_value_t*)); + return v; } -// Queue a object to be scanned. The object should already be marked and the GC metadata -// should already be updated for it. Only scanning of the object should be performed. -STATIC_INLINE void gc_mark_queue_scan_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_value_t *obj) +// Steal from `mq2` +STATIC_INLINE jl_value_t *gc_ptr_queue_steal_from(jl_gc_markqueue_t *mq2) JL_NOTSAFEPOINT { - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - uintptr_t tag = o->header; - uint8_t bits = tag & 0xf; - tag = tag & ~(uintptr_t)0xf; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_scan_only], - &data, sizeof(data), 1); + jl_value_t *v = NULL; + ws_queue_steal_from(&mq2->ptr_queue, &v, sizeof(jl_value_t*)); + return v; } -// Mark and queue a object to be scanned. -// The object will be marked atomically which can also happen concurrently. 
-// It will be queued if the object wasn't marked already (or concurrently by another thread) -// Returns whether the object is young. -STATIC_INLINE int gc_mark_queue_obj(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, void *_obj) JL_NOTSAFEPOINT +// Push chunk `*c` into chunk queue +STATIC_INLINE void gc_chunkqueue_push(jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT { - jl_value_t *obj = (jl_value_t*)jl_assume(_obj); - uintptr_t nptr = 0; - uintptr_t tag = 0; - uint8_t bits = 0; - if (!gc_try_setmark(obj, &nptr, &tag, &bits)) - return (int)nptr; - gc_mark_marked_obj_t data = {obj, tag, bits}; - gc_mark_stack_push(gc_cache, sp, gc_mark_label_addrs[GC_MARK_L_marked_obj], - &data, sizeof(data), 1); - return (int)nptr; + ws_array_t *old_a = ws_queue_push(&mq->chunk_queue, c, sizeof(jl_gc_chunk_t)); + // Put `old_a` in `reclaim_set` to be freed after the mark phase + if (__unlikely(old_a != NULL)) + arraylist_push(&mq->reclaim_set, old_a); } -int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_value_t *obj) +// Pop chunk from chunk queue +STATIC_INLINE jl_gc_chunk_t gc_chunkqueue_pop(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(gc_cache, sp, obj); + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + ws_queue_pop(&mq->chunk_queue, &c, sizeof(jl_gc_chunk_t)); + return c; } -JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +// Steal chunk from `mq2` +STATIC_INLINE jl_gc_chunk_t gc_chunkqueue_steal_from(jl_gc_markqueue_t *mq2) JL_NOTSAFEPOINT { - return gc_mark_queue_obj(&ptls->gc_cache, &ptls->gc_mark_sp, obj); + jl_gc_chunk_t c = {.cid = GC_empty_chunk}; + ws_queue_steal_from(&mq2->chunk_queue, &c, sizeof(jl_gc_chunk_t)); + return c; } -JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, - jl_value_t **objs, size_t nobjs) +// Enqueue an unmarked obj. last bit of `nptr` is set if `_obj` is young +STATIC_INLINE void gc_try_claim_and_push(jl_gc_markqueue_t *mq, void *_obj, + uintptr_t *nptr) JL_NOTSAFEPOINT { - gc_mark_objarray_t data = { parent, objs, objs + nobjs, 1, - jl_astaggedvalue(parent)->bits.gc & 2 }; - gc_mark_stack_push(&ptls->gc_cache, &ptls->gc_mark_sp, - gc_mark_label_addrs[GC_MARK_L_objarray], - &data, sizeof(data), 1); + if (_obj == NULL) + return; + jl_value_t *obj = (jl_value_t *)jl_assume(_obj); + jl_taggedvalue_t *o = jl_astaggedvalue(obj); + if (!gc_old(o->header) && nptr) + *nptr |= 1; + if (gc_try_setmark_tag(o, GC_MARKED)) + gc_ptr_queue_push(mq, obj); } - -// Check if `nptr` is tagged for `old + refyoung`, -// Push the object to the remset and update the `nptr` counter if necessary. 
-STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, uintptr_t nptr) JL_NOTSAFEPOINT +// Mark object with 8bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj8(jl_ptls_t ptls, char *obj8_parent, uint8_t *obj8_begin, + uint8_t *obj8_end, uintptr_t nptr) JL_NOTSAFEPOINT { - if (__unlikely((nptr & 0x3) == 0x3)) { - ptls->heap.remset_nptr += nptr >> 2; - arraylist_t *remset = ptls->heap.remset; - size_t len = remset->len; - if (__unlikely(len >= remset->max)) { - arraylist_push(remset, obj); - } - else { - remset->len = len + 1; - remset->items[len] = obj; + (void)jl_assume(obj8_begin < obj8_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj8_begin < obj8_end; obj8_begin++) { + slot = &((jl_value_t**)obj8_parent)[*obj8_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj8_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj8_parent, slot, (jl_datatype_t*)jl_typeof(obj8_parent))); + if (obj8_begin + 1 != obj8_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); + } + else { + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; + } + gc_heap_snapshot_record_object_edge((jl_value_t*)obj8_parent, slot); } } + gc_mark_push_remset(ptls, (jl_value_t *)obj8_parent, nptr); + return new_obj; } -// Scan a dense array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_objarray(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_objarray_t *objary, - jl_value_t **begin, jl_value_t **end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) +// Mark object with 16bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj16(jl_ptls_t ptls, char *obj16_parent, uint16_t *obj16_begin, + uint16_t *obj16_end, uintptr_t nptr) JL_NOTSAFEPOINT { - (void)jl_assume(objary == (gc_mark_objarray_t*)sp->data); - for (; begin < end; begin += objary->step) { - *pnew_obj = *begin; - if (*pnew_obj) { - verify_parent2("obj array", objary->parent, begin, "elem(%d)", - gc_slot_to_arrayidx(objary->parent, begin)); - gc_heap_snapshot_record_array_edge(objary->parent, begin); - } - if (!gc_try_setmark(*pnew_obj, &objary->nptr, ptag, pbits)) - continue; - begin += objary->step; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - objary->begin = begin; - gc_repush_markdata(sp, gc_mark_objarray_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
- gc_mark_push_remset(ptls, objary->parent, objary->nptr); - } - return 1; - } - gc_mark_push_remset(ptls, objary->parent, objary->nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array8_t *ary8, - jl_value_t **begin, jl_value_t **end, - uint8_t *elem_begin, uint8_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary8 == (gc_mark_array8_t*)sp->data); - size_t elsize = ((jl_array_t*)ary8->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary8->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary8->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary8->elem.parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &ary8->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary8->elem.begin = elem_begin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); + (void)jl_assume(obj16_begin < obj16_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj16_begin < obj16_end; obj16_begin++) { + slot = &((jl_value_t **)obj16_parent)[*obj16_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj16_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj16_parent, slot, (jl_datatype_t*)jl_typeof(obj16_parent))); + gc_try_claim_and_push(mq, new_obj, &nptr); + if (obj16_begin + 1 != obj16_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary8->elem.begin = ary8->rebegin; - ary8->begin = begin; - gc_repush_markdata(sp, gc_mark_array8_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. 
- gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - } + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; } - return 1; + gc_heap_snapshot_record_object_edge((jl_value_t*)obj16_parent, slot); } - elem_begin = ary8->rebegin; - } - gc_mark_push_remset(ptls, ary8->elem.parent, ary8->elem.nptr); - return 0; -} - -// Scan a sparse array of object references, see `gc_mark_objarray_t` -STATIC_INLINE int gc_mark_scan_array16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, - gc_mark_array16_t *ary16, - jl_value_t **begin, jl_value_t **end, - uint16_t *elem_begin, uint16_t *elem_end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(ary16 == (gc_mark_array16_t*)sp->data); - size_t elsize = ((jl_array_t*)ary16->elem.parent)->elsize / sizeof(jl_value_t*); - for (; begin < end; begin += elsize) { - for (; elem_begin < elem_end; elem_begin++) { - jl_value_t **slot = &begin[*elem_begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("array", ary16->elem.parent, slot, "elem(%d)", - gc_slot_to_arrayidx(ary16->elem.parent, begin)); - gc_heap_snapshot_record_array_edge(ary16->elem.parent, slot); - } - if (!gc_try_setmark(*pnew_obj, &ary16->elem.nptr, ptag, pbits)) - continue; - elem_begin++; - // Found an object to mark - if (elem_begin < elem_end) { - // Haven't done with this one yet. Update the content and push it back - ary16->elem.begin = elem_begin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); + } + gc_mark_push_remset(ptls, (jl_value_t *)obj16_parent, nptr); + return new_obj; +} + +// Mark object with 32bit field descriptors +STATIC_INLINE jl_value_t *gc_mark_obj32(jl_ptls_t ptls, char *obj32_parent, uint32_t *obj32_begin, + uint32_t *obj32_end, uintptr_t nptr) JL_NOTSAFEPOINT +{ + (void)jl_assume(obj32_begin < obj32_end); + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t **slot = NULL; + jl_value_t *new_obj = NULL; + for (; obj32_begin < obj32_end; obj32_begin++) { + slot = &((jl_value_t **)obj32_parent)[*obj32_begin]; + new_obj = *slot; + if (new_obj != NULL) { + verify_parent2("object", obj32_parent, slot, "field(%d)", + gc_slot_to_fieldidx(obj32_parent, slot, (jl_datatype_t*)jl_typeof(obj32_parent))); + if (obj32_begin + 1 != obj32_end) { + gc_try_claim_and_push(mq, new_obj, &nptr); } else { - begin += elsize; - if (begin < end) { - // Haven't done with this array yet. Reset the content and push it back - ary16->elem.begin = ary16->rebegin; - ary16->begin = begin; - gc_repush_markdata(sp, gc_mark_array16_t); - } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); - } + // Unroll marking of last item to avoid pushing + // and popping it right away + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + nptr |= !gc_old(o->header); + if (!gc_try_setmark_tag(o, GC_MARKED)) new_obj = NULL; } - return 1; + gc_heap_snapshot_record_object_edge((jl_value_t*)obj32_parent, slot); } - elem_begin = ary16->rebegin; } - gc_mark_push_remset(ptls, ary16->elem.parent, ary16->elem.nptr); - return 0; -} - - -// Scan an object with 8bits field descriptors. 
see `gc_mark_obj8_t` -STATIC_INLINE int gc_mark_scan_obj8(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj8_t *obj8, - char *parent, uint8_t *begin, uint8_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(obj8 == (gc_mark_obj8_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); + return new_obj; +} + +// Mark object array +STATIC_INLINE void gc_mark_objarray(jl_ptls_t ptls, jl_value_t *obj_parent, jl_value_t **obj_begin, + jl_value_t **obj_end, uint32_t step, uintptr_t nptr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + // Decide whether need to chunk objary + (void)jl_assume(step > 0); + if ((nptr & 0x2) == 0x2) { + // pre-scan this object: most of this object should be old, so look for + // the first young object before starting this chunk + // (this also would be valid for young objects, but probably less beneficial) + for (; obj_begin < obj_end; obj_begin += step) { + new_obj = *obj_begin; + if (new_obj != NULL) { + verify_parent2("obj array", obj_parent, obj_begin, "elem(%d)", + gc_slot_to_arrayidx(obj_parent, obj_begin)); + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + if (!gc_old(o->header)) + nptr |= 1; + if (!gc_marked(o->header)) + break; + gc_heap_snapshot_record_array_edge(obj_parent, &new_obj); + } } - if (!gc_try_setmark(*pnew_obj, &obj8->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj8->begin = begin; - gc_repush_markdata(sp, gc_mark_obj8_t); + } + size_t too_big = (obj_end - obj_begin) / GC_CHUNK_BATCH_SIZE > step; // use this order of operations to avoid idiv + jl_value_t **scan_end = obj_end; + int pushed_chunk = 0; + if (too_big) { + scan_end = obj_begin + step * GC_CHUNK_BATCH_SIZE; + // case 1: array owner is young, so we won't need to scan through all its elements + // to know that we will never need to push it to the remset. it's fine + // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue + // ASAP in order to expose as much parallelism as possible + // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after + // scanning the array elements + if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) { + jl_gc_chunk_t c = {GC_objary_chunk, obj_parent, scan_end, obj_end, NULL, NULL, step, nptr}; + gc_chunkqueue_push(mq, &c); + pushed_chunk = 1; } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj8->parent, obj8->nptr); + } + for (; obj_begin < scan_end; obj_begin += step) { + new_obj = *obj_begin; + if (new_obj != NULL) { + verify_parent2("obj array", obj_parent, obj_begin, "elem(%d)", + gc_slot_to_arrayidx(obj_parent, obj_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(obj_parent, &new_obj); } - return 1; - } - gc_mark_push_remset(ptls, obj8->parent, obj8->nptr); - return 0; -} - -// Scan an object with 16bits field descriptors. 
see `gc_mark_obj16_t` -STATIC_INLINE int gc_mark_scan_obj16(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj16_t *obj16, - char *parent, uint16_t *begin, uint16_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) JL_NOTSAFEPOINT -{ - (void)jl_assume(obj16 == (gc_mark_obj16_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); + } + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_objary_chunk, obj_parent, scan_end, obj_end, NULL, NULL, step, nptr}; + gc_chunkqueue_push(mq, &c); } - if (!gc_try_setmark(*pnew_obj, &obj16->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj16->begin = begin; - gc_repush_markdata(sp, gc_mark_obj16_t); + } + else { + gc_mark_push_remset(ptls, obj_parent, nptr); + } +} + +// Mark array with 8bit field descriptors +STATIC_INLINE void gc_mark_array8(jl_ptls_t ptls, jl_value_t *ary8_parent, jl_value_t **ary8_begin, + jl_value_t **ary8_end, uint8_t *elem_begin, uint8_t *elem_end, + uintptr_t nptr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + size_t elsize = ((jl_array_t *)ary8_parent)->elsize / sizeof(jl_value_t *); + assert(elsize > 0); + // Decide whether need to chunk objary + if ((nptr & 0x2) == 0x2) { + // pre-scan this object: most of this object should be old, so look for + // the first young object before starting this chunk + // (this also would be valid for young objects, but probably less beneficial) + for (; ary8_begin < ary8_end; ary8_begin += elsize) { + int early_end = 0; + for (uint8_t *pindex = elem_begin; pindex < elem_end; pindex++) { + new_obj = ary8_begin[*pindex]; + if (new_obj != NULL) { + verify_parent2("array", ary8_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary8_parent, ary8_begin)); + jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); + if (!gc_old(o->header)) + nptr |= 1; + if (!gc_marked(o->header)){ + early_end = 1; + break; + } + gc_heap_snapshot_record_array_edge(ary8_parent, &new_obj); + } + } + if (early_end) + break; } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); + } + size_t too_big = (ary8_end - ary8_begin) / GC_CHUNK_BATCH_SIZE > elsize; // use this order of operations to avoid idiv + jl_value_t **scan_end = ary8_end; + int pushed_chunk = 0; + if (too_big) { + scan_end = ary8_begin + elsize * GC_CHUNK_BATCH_SIZE; + // case 1: array owner is young, so we won't need to scan through all its elements + // to know that we will never need to push it to the remset. 
it's fine + // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue + // ASAP in order to expose as much parallelism as possible + // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after + // scanning the array elements + if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) { + jl_gc_chunk_t c = {GC_ary8_chunk, ary8_parent, scan_end, ary8_end, elem_begin, elem_end, 0, nptr}; + gc_chunkqueue_push(mq, &c); + pushed_chunk = 1; } - return 1; - } - gc_mark_push_remset(ptls, obj16->parent, obj16->nptr); - return 0; -} - -// Scan an object with 32bits field descriptors. see `gc_mark_obj32_t` -STATIC_INLINE int gc_mark_scan_obj32(jl_ptls_t ptls, jl_gc_mark_sp_t *sp, gc_mark_obj32_t *obj32, - char *parent, uint32_t *begin, uint32_t *end, - jl_value_t **pnew_obj, uintptr_t *ptag, uint8_t *pbits) -{ - (void)jl_assume(obj32 == (gc_mark_obj32_t*)sp->data); - (void)jl_assume(begin < end); - for (; begin < end; begin++) { - jl_value_t **slot = &((jl_value_t**)parent)[*begin]; - *pnew_obj = *slot; - if (*pnew_obj) { - verify_parent2("object", parent, slot, "field(%d)", - gc_slot_to_fieldidx(parent, slot, (jl_datatype_t*)jl_typeof(parent))); - gc_heap_snapshot_record_object_edge((jl_value_t*)parent, slot); + } + for (; ary8_begin < ary8_end; ary8_begin += elsize) { + for (uint8_t *pindex = elem_begin; pindex < elem_end; pindex++) { + new_obj = ary8_begin[*pindex]; + if (new_obj != NULL) { + verify_parent2("array", ary8_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary8_parent, ary8_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(ary8_parent, &new_obj); + } } - if (!gc_try_setmark(*pnew_obj, &obj32->nptr, ptag, pbits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - obj32->begin = begin; - gc_repush_markdata(sp, gc_mark_obj32_t); + } + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_ary8_chunk, ary8_parent, scan_end, ary8_end, elem_begin, elem_end, 0, nptr}; + gc_chunkqueue_push(mq, &c); } - else { - // Finished scanning this one, finish up by checking the GC invariance - // and let the next item replacing the current one directly. - gc_mark_push_remset(ptls, obj32->parent, obj32->nptr); + } + else { + gc_mark_push_remset(ptls, ary8_parent, nptr); + } +} + +// Mark array with 16bit field descriptors +STATIC_INLINE void gc_mark_array16(jl_ptls_t ptls, jl_value_t *ary16_parent, jl_value_t **ary16_begin, + jl_value_t **ary16_end, uint16_t *elem_begin, uint16_t *elem_end, + uintptr_t nptr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + size_t elsize = ((jl_array_t *)ary16_parent)->elsize / sizeof(jl_value_t *); + // Decide whether need to chunk ary16 + size_t too_big = (ary16_end - ary16_begin) / GC_CHUNK_BATCH_SIZE > elsize; // use this order of operations to avoid idiv + jl_value_t **scan_end = ary16_end; + int pushed_chunk = 0; + if (too_big) { + scan_end = ary16_begin + elsize * GC_CHUNK_BATCH_SIZE; + // case 1: array owner is young, so we won't need to scan through all its elements + // to know that we will never need to push it to the remset. 
it's fine + // to create a chunk with "incorrect" `nptr` and push it to the chunk-queue + // ASAP in order to expose as much parallelism as possible + // case 2: lowest two bits of `nptr` are already set to 0x3, so won't change after + // scanning the array elements + if ((nptr & 0x2) != 0x2 || (nptr & 0x3) == 0x3) { + jl_gc_chunk_t c = {GC_ary16_chunk, ary16_parent, scan_end, ary16_end, elem_begin, elem_end, elsize, nptr}; + gc_chunkqueue_push(mq, &c); + pushed_chunk = 1; } - return 1; } - gc_mark_push_remset(ptls, obj32->parent, obj32->nptr); - return 0; -} - -#if defined(__GNUC__) && !defined(_OS_EMSCRIPTEN_) -# define gc_mark_laddr(name) (&&name) -# define gc_mark_jmp(ptr) goto *(ptr) -#else -#define gc_mark_laddr(name) ((void*)(uintptr_t)GC_MARK_L_##name) -#define gc_mark_jmp(ptr) do { \ - switch ((int)(uintptr_t)ptr) { \ - case GC_MARK_L_marked_obj: \ - goto marked_obj; \ - case GC_MARK_L_scan_only: \ - goto scan_only; \ - case GC_MARK_L_finlist: \ - goto finlist; \ - case GC_MARK_L_objarray: \ - goto objarray; \ - case GC_MARK_L_array8: \ - goto array8; \ - case GC_MARK_L_array16: \ - goto array16; \ - case GC_MARK_L_obj8: \ - goto obj8; \ - case GC_MARK_L_obj16: \ - goto obj16; \ - case GC_MARK_L_obj32: \ - goto obj32; \ - case GC_MARK_L_stack: \ - goto stack; \ - case GC_MARK_L_excstack: \ - goto excstack; \ - case GC_MARK_L_module_binding: \ - goto module_binding; \ - default: \ - abort(); \ - } \ - } while (0) -#endif - -// This is the main marking loop. -// It uses an iterative (mostly) Depth-first search (DFS) to mark all the objects. -// Instead of using the native stack, two stacks are manually maintained, -// one (fixed-size) pc stack which stores the return address and one (variable-size) -// data stack which stores the local variables needed by the scanning code. -// Using a manually maintained stack has a few advantages -// -// 1. We can resize the stack as we go and never worry about stack overflow -// This is especitally useful when enters the GC in a deep call stack. -// It also removes the very deep GC call stack in a profile. -// 2. We can minimize the number of local variables to save on the stack. -// This includes minimizing the sizes of the stack frames and only saving variables -// that have been changed before making "function calls" (i.e. `goto mark;`) -// 3. We can perform end-of-loop tail-call optimization for common cases. -// 4. The marking can be interrupted more easily since all the states are maintained -// in a well-defined format already. -// This will be useful if we want to have incremental marking again. -// 5. The frames can be stolen by another thread more easily and it is not necessary -// to copy works to be stolen to another queue. Useful for parallel marking. -// (Will still require synchronization in stack popping of course.) -// 6. A flat function (i.e. no or very few function calls) also give the compiler -// opportunity to keep more states in registers that doesn't have to be spilled as often. -// -// We use two stacks so that the thief on another thread can steal the fixed sized pc stack -// and use that to figure out the size of the struct on the variable size data stack. -// -// The main disadvantages are that we bypass some stack-based CPU optimizations including the -// stack engine and return address prediction. -// Using two stacks also double the number of operations on the stack pointer -// though we still only need to use one of them (the pc stack pointer) for bounds check. 
-// In general, it seems that the reduction of stack memory ops and instructions count -// have a larger positive effect on the performance. =) - -// As a general guide we do not want to make non-inlined function calls in this function -// if possible since a large number of registers has to be spilled when that happens. -// This is especially true on on X86 which doesn't have many (any?) -// callee saved general purpose registers. -// (OTOH, the spill will likely make use of the stack engine which is otherwise idle so -// the performance impact is minimum as long as it's not in the hottest path) - -// There are three external entry points to the loop, corresponding to label -// `marked_obj`, `scan_only` and `finlist` (see the corresponding functions -// `gc_mark_queue_obj`, `gc_mark_queue_scan_obj` and `gc_mark_queue_finlist` above). -// The scanning of the object starts with `goto mark`, which updates the metadata and scans -// the object whose information is stored in `new_obj`, `tag` and `bits`. -// The branches in `mark` will dispatch the object to one of the scan "loop"s to be scanned -// as either a normal julia object or one of the special objects with specific storage format. -// Each of the scan "loop" will perform a DFS of the object in the following way -// -// 1. When encountering an pointer (julia object reference) slots, load, perform NULL check -// and atomically set the mark bits to determine if the object needs to be scanned. -// 2. If yes, it'll push itself back onto the mark stack (after updating fields that are changed) -// using `gc_repush_markdata` to increment the stack pointers. -// This step can also be replaced by a tail call by finishing up the marking of the current -// object when the end of the current object is reached. -// 3. Jump to `mark`. The marking of the current object will be resumed after the child is -// scanned by popping the stack frame back. -// -// Some of the special object scannings use BFS to simplify the code (Task and Module). - -// The jumps from the dispatch to the scan "loop"s are done by first pushing a frame -// to the stacks while only increment the data stack pointer before jumping to the loop -// This way the scan "loop" gets exactly what it expects after a stack pop. -// Additional optimizations are done for some of the common cases by skipping -// the unnecessary data stack pointer increment and the load from the stack -// (i.e. store to load forwarding). See `objary_loaded`, `obj8_loaded` and `obj16_loaded`. 
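The deleted block above is the design rationale for the old two-stack mark loop; the code this patch adds replaces it with per-thread mark queues plus chunking of oversized arrays, so that idle GC threads can steal self-contained pieces of work. Below is a minimal sketch of that chunking idea, assuming a toy fixed-capacity LIFO in place of the real lock-free work-stealing queue; `toy_chunk_t`, `toy_push_chunk`, `toy_mark_objarray` and `BATCH` are illustrative stand-ins for `jl_gc_chunk_t`, `gc_chunkqueue_push`, `gc_mark_objarray` and `GC_CHUNK_BATCH_SIZE`, not code from this patch:

```c
#include <stddef.h>
#include <stdio.h>

#define BATCH 4096 /* stand-in for GC_CHUNK_BATCH_SIZE */

typedef struct {
    void **begin; /* first deferred slot */
    void **end;   /* one past the last slot */
} toy_chunk_t;

static toy_chunk_t toy_chunks[64];
static int toy_nchunks = 0;

static void toy_push_chunk(void **begin, void **end)
{
    toy_chunks[toy_nchunks++] = (toy_chunk_t){begin, end};
}

/* Scan at most BATCH slots now and defer the remainder as a chunk
 * that any idle worker could pick up later. */
static void toy_mark_objarray(void **begin, void **end)
{
    void **scan_end = end;
    if ((size_t)(end - begin) > BATCH) {
        scan_end = begin + BATCH;
        toy_push_chunk(scan_end, end); /* remainder becomes stealable work */
    }
    for (; begin < scan_end; begin++) {
        if (*begin != NULL) {
            /* a real marker would try to claim *begin and enqueue it */
        }
    }
}

int main(void)
{
    static void *slots[10000]; /* zero-initialized: all slots are NULL */
    toy_mark_objarray(slots, slots + 10000);
    printf("deferred %d chunk(s)\n", toy_nchunks); /* prints: deferred 1 chunk(s) */
    return 0;
}
```

Capping each queue item at `BATCH` slots is what lets a single huge array be spread across several marker threads instead of pinning one worker, which is the parallelism the `gc_mark_objarray`/`gc_mark_array8`/`gc_mark_array16` additions in this patch aim for.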
-JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp) -{ - if (__unlikely(ptls == NULL)) { - gc_mark_label_addrs[GC_MARK_L_marked_obj] = gc_mark_laddr(marked_obj); - gc_mark_label_addrs[GC_MARK_L_scan_only] = gc_mark_laddr(scan_only); - gc_mark_label_addrs[GC_MARK_L_finlist] = gc_mark_laddr(finlist); - gc_mark_label_addrs[GC_MARK_L_objarray] = gc_mark_laddr(objarray); - gc_mark_label_addrs[GC_MARK_L_array8] = gc_mark_laddr(array8); - gc_mark_label_addrs[GC_MARK_L_array16] = gc_mark_laddr(array16); - gc_mark_label_addrs[GC_MARK_L_obj8] = gc_mark_laddr(obj8); - gc_mark_label_addrs[GC_MARK_L_obj16] = gc_mark_laddr(obj16); - gc_mark_label_addrs[GC_MARK_L_obj32] = gc_mark_laddr(obj32); - gc_mark_label_addrs[GC_MARK_L_stack] = gc_mark_laddr(stack); - gc_mark_label_addrs[GC_MARK_L_excstack] = gc_mark_laddr(excstack); - gc_mark_label_addrs[GC_MARK_L_module_binding] = gc_mark_laddr(module_binding); - return; + for (; ary16_begin < scan_end; ary16_begin += elsize) { + for (uint16_t *pindex = elem_begin; pindex < elem_end; pindex++) { + new_obj = ary16_begin[*pindex]; + if (new_obj != NULL) { + verify_parent2("array", ary16_parent, &new_obj, "elem(%d)", + gc_slot_to_arrayidx(ary16_parent, ary16_begin)); + gc_try_claim_and_push(mq, new_obj, &nptr); + gc_heap_snapshot_record_array_edge(ary16_parent, &new_obj); + } + } } - - jl_value_t *new_obj = NULL; - uintptr_t tag = 0; - uint8_t bits = 0; - int meta_updated = 0; - - gc_mark_objarray_t *objary; - jl_value_t **objary_begin; - jl_value_t **objary_end; - - gc_mark_array8_t *ary8; - gc_mark_array16_t *ary16; - - gc_mark_obj8_t *obj8; - char *obj8_parent; - uint8_t *obj8_begin; - uint8_t *obj8_end; - - gc_mark_obj16_t *obj16; - char *obj16_parent; - uint16_t *obj16_begin; - uint16_t *obj16_end; - -pop: - if (sp.pc == sp.pc_start) { - // TODO: stealing form another thread - return; + if (too_big) { + if (!pushed_chunk) { + jl_gc_chunk_t c = {GC_ary16_chunk, ary16_parent, scan_end, ary16_end, elem_begin, elem_end, elsize, nptr}; + gc_chunkqueue_push(mq, &c); + } } - sp.pc--; - gc_mark_jmp(*sp.pc); // computed goto - -marked_obj: { - // An object that has been marked and needs have metadata updated and scanned. - gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); - new_obj = obj->obj; - tag = obj->tag; - bits = obj->bits; - goto mark; - } - -scan_only: { - // An object that has been marked and needs to be scanned. 
- gc_mark_marked_obj_t *obj = gc_pop_markdata(&sp, gc_mark_marked_obj_t); - new_obj = obj->obj; - tag = obj->tag; - bits = obj->bits; - meta_updated = 1; - goto mark; - } - -objarray: - objary = gc_pop_markdata(&sp, gc_mark_objarray_t); - objary_begin = objary->begin; - objary_end = objary->end; -objarray_loaded: - if (gc_mark_scan_objarray(ptls, &sp, objary, objary_begin, objary_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array8: - ary8 = gc_pop_markdata(&sp, gc_mark_array8_t); - objary_begin = ary8->begin; - objary_end = ary8->end; - obj8_begin = ary8->elem.begin; - obj8_end = ary8->elem.end; -array8_loaded: - if (gc_mark_scan_array8(ptls, &sp, ary8, objary_begin, objary_end, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -array16: - ary16 = gc_pop_markdata(&sp, gc_mark_array16_t); - objary_begin = ary16->begin; - objary_end = ary16->end; - obj16_begin = ary16->elem.begin; - obj16_end = ary16->elem.end; -array16_loaded: - if (gc_mark_scan_array16(ptls, &sp, ary16, objary_begin, objary_end, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj8: - obj8 = gc_pop_markdata(&sp, gc_mark_obj8_t); - obj8_parent = (char*)obj8->parent; - obj8_begin = obj8->begin; - obj8_end = obj8->end; -obj8_loaded: - if (gc_mark_scan_obj8(ptls, &sp, obj8, obj8_parent, obj8_begin, obj8_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj16: - obj16 = gc_pop_markdata(&sp, gc_mark_obj16_t); - obj16_parent = (char*)obj16->parent; - obj16_begin = obj16->begin; - obj16_end = obj16->end; -obj16_loaded: - if (gc_mark_scan_obj16(ptls, &sp, obj16, obj16_parent, obj16_begin, obj16_end, - &new_obj, &tag, &bits)) - goto mark; - goto pop; - -obj32: { - gc_mark_obj32_t *obj32 = gc_pop_markdata(&sp, gc_mark_obj32_t); - char *parent = (char*)obj32->parent; - uint32_t *begin = obj32->begin; - uint32_t *end = obj32->end; - if (gc_mark_scan_obj32(ptls, &sp, obj32, parent, begin, end, &new_obj, &tag, &bits)) - goto mark; - goto pop; + else { + gc_mark_push_remset(ptls, ary16_parent, nptr); } +} -stack: { - // Scan the stack. see `gc_mark_stackframe_t` - // The task object this stack belongs to is being scanned separately as a normal - // 8bit field descriptor object. - gc_mark_stackframe_t *stack = gc_pop_markdata(&sp, gc_mark_stackframe_t); - jl_gcframe_t *s = stack->s; - uint32_t i = stack->i; - uint32_t nroots = stack->nroots; - uintptr_t offset = stack->offset; - uintptr_t lb = stack->lb; - uintptr_t ub = stack->ub; - uint32_t nr = nroots >> 2; - uintptr_t nptr = 0; - while (1) { - jl_value_t ***rts = (jl_value_t***)(((void**)s) + 2); - for (; i < nr; i++) { - if (nroots & 1) { - void **slot = (void**)gc_read_stack(&rts[i], offset, lb, ub); - new_obj = (jl_value_t*)gc_read_stack(slot, offset, lb, ub); - } - else { - new_obj = (jl_value_t*)gc_read_stack(&rts[i], offset, lb, ub); - if (gc_ptr_tag(new_obj, 1)) { - // handle tagged pointers in finalizer list - new_obj = gc_ptr_clear_tag(new_obj, 1); - // skip over the finalizer fptr - i++; - } - if (gc_ptr_tag(new_obj, 2)) - continue; - } - if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) - continue; - gc_heap_snapshot_record_frame_to_object_edge(s, new_obj); - i++; - if (i < nr) { - // Haven't done with this one yet. 
Update the content and push it back - stack->i = i; - gc_repush_markdata(&sp, gc_mark_stackframe_t); - } - // TODO stack addresses needs copy stack handling - else if ((s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub))) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - stack->i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - stack->nroots = (uint32_t)new_nroots; - gc_repush_markdata(&sp, gc_mark_stackframe_t); - } - goto mark; - } - s = (jl_gcframe_t*)gc_read_stack(&s->prev, offset, lb, ub); - // walk up one stack frame - if (s != 0) { - gc_heap_snapshot_record_frame_to_frame_edge(stack->s, s); - stack->s = s; - i = 0; - uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); - assert(new_nroots <= UINT32_MAX); - nroots = stack->nroots = (uint32_t)new_nroots; - nr = nroots >> 2; - continue; - } - goto pop; +// Mark chunk of large array +STATIC_INLINE void gc_mark_chunk(jl_ptls_t ptls, jl_gc_markqueue_t *mq, jl_gc_chunk_t *c) JL_NOTSAFEPOINT +{ + switch (c->cid) { + case GC_objary_chunk: { + jl_value_t *obj_parent = c->parent; + jl_value_t **obj_begin = c->begin; + jl_value_t **obj_end = c->end; + uint32_t step = c->step; + uintptr_t nptr = c->nptr; + gc_mark_objarray(ptls, obj_parent, obj_begin, obj_end, step, + nptr); + break; } - } - -excstack: { - // Scan an exception stack - gc_mark_excstack_t *stackitr = gc_pop_markdata(&sp, gc_mark_excstack_t); - jl_excstack_t *excstack = stackitr->s; - size_t itr = stackitr->itr; - size_t bt_index = stackitr->bt_index; - size_t jlval_index = stackitr->jlval_index; - while (itr > 0) { - size_t bt_size = jl_excstack_bt_size(excstack, itr); - jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); - for (; bt_index < bt_size; bt_index += jl_bt_entry_size(bt_data + bt_index)) { - jl_bt_element_t *bt_entry = bt_data + bt_index; - if (jl_bt_is_native(bt_entry)) - continue; - // Found an extended backtrace entry: iterate over any - // GC-managed values inside. 
- size_t njlvals = jl_bt_num_jlvals(bt_entry); - while (jlval_index < njlvals) { - new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index); - gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj); - uintptr_t nptr = 0; - jlval_index += 1; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } - } - jlval_index = 0; - } - // The exception comes last - mark it - new_obj = jl_excstack_exception(excstack, itr); - gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj); - itr = jl_excstack_next(excstack, itr); - bt_index = 0; - jlval_index = 0; - uintptr_t nptr = 0; - if (gc_try_setmark(new_obj, &nptr, &tag, &bits)) { - stackitr->itr = itr; - stackitr->bt_index = bt_index; - stackitr->jlval_index = jlval_index; - gc_repush_markdata(&sp, gc_mark_excstack_t); - goto mark; - } + case GC_ary8_chunk: { + jl_value_t *ary8_parent = c->parent; + jl_value_t **ary8_begin = c->begin; + jl_value_t **ary8_end = c->end; + uint8_t *elem_begin = (uint8_t *)c->elem_begin; + uint8_t *elem_end = (uint8_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array8(ptls, ary8_parent, ary8_begin, ary8_end, elem_begin, elem_end, + nptr); + break; + } + case GC_ary16_chunk: { + jl_value_t *ary16_parent = c->parent; + jl_value_t **ary16_begin = c->begin; + jl_value_t **ary16_end = c->end; + uint16_t *elem_begin = (uint16_t *)c->elem_begin; + uint16_t *elem_end = (uint16_t *)c->elem_end; + uintptr_t nptr = c->nptr; + gc_mark_array16(ptls, ary16_parent, ary16_begin, ary16_end, elem_begin, elem_end, + nptr); + break; + } + case GC_finlist_chunk: { + jl_value_t **fl_begin = c->begin; + jl_value_t **fl_end = c->end; + gc_mark_finlist_(mq, fl_begin, fl_end); + break; + } + default: { + // `empty-chunk` should be checked by caller + jl_safe_printf("GC internal error: chunk mismatch\n"); + abort(); } - goto pop; } +} -module_binding: { - // Scan a module. 
see `gc_mark_binding_t` - // Other fields of the module will be scanned after the bindings are scanned - gc_mark_binding_t *binding = gc_pop_markdata(&sp, gc_mark_binding_t); - jl_binding_t **begin = binding->begin; - jl_binding_t **end = binding->end; - uint8_t mbits = binding->bits; - for (; begin < end; begin += 2) { - jl_binding_t *b = *begin; - if (b == (jl_binding_t*)HT_NOTFOUND) - continue; - if (jl_object_in_image((jl_value_t*)b)) { - jl_taggedvalue_t *buf = jl_astaggedvalue(b); - uintptr_t tag = buf->header; - uint8_t bits; - if (!gc_marked(tag)) - gc_setmark_tag(buf, GC_OLD_MARKED, tag, &bits); +// Mark gc frame +STATIC_INLINE void gc_mark_stack(jl_ptls_t ptls, jl_gcframe_t *s, uint32_t nroots, uintptr_t offset, + uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + uint32_t nr = nroots >> 2; + while (1) { + jl_value_t ***rts = (jl_value_t ***)(((void **)s) + 2); + for (uint32_t i = 0; i < nr; i++) { + if (nroots & 1) { + void **slot = (void **)gc_read_stack(&rts[i], offset, lb, ub); + new_obj = (jl_value_t *)gc_read_stack(slot, offset, lb, ub); } else { - gc_setmark_buf_(ptls, b, mbits, sizeof(jl_binding_t)); - } - void *vb = jl_astaggedvalue(b); - verify_parent1("module", binding->parent, &vb, "binding_buff"); - // Record the size used for the box for non-const bindings - gc_heap_snapshot_record_module_to_binding(binding->parent, b); - (void)vb; - jl_value_t *ty = jl_atomic_load_relaxed(&b->ty); - if (ty && ty != (jl_value_t*)jl_any_type) { - verify_parent2("module", binding->parent, - &b->ty, "binding(%s)", jl_symbol_name(b->name)); - if (gc_try_setmark(ty, &binding->nptr, &tag, &bits)) { - new_obj = ty; - gc_repush_markdata(&sp, gc_mark_binding_t); - goto mark; - } - } - jl_value_t *value = jl_atomic_load_relaxed(&b->value); - jl_value_t *globalref = jl_atomic_load_relaxed(&b->globalref); - if (value) { - verify_parent2("module", binding->parent, - &b->value, "binding(%s)", jl_symbol_name(b->name)); - if (gc_try_setmark(value, &binding->nptr, &tag, &bits)) { - new_obj = value; - begin += 2; - binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); - uintptr_t gr_tag; - uint8_t gr_bits; - if (gc_try_setmark(globalref, &binding->nptr, &gr_tag, &gr_bits)) { - gc_mark_marked_obj_t data = {globalref, gr_tag, gr_bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(marked_obj), - &data, sizeof(data), 1); - } - goto mark; + new_obj = (jl_value_t *)gc_read_stack(&rts[i], offset, lb, ub); + if (gc_ptr_tag(new_obj, 1)) { + // handle tagged pointers in finalizer list + new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1); + // skip over the finalizer fptr + i++; } + if (gc_ptr_tag(new_obj, 2)) + continue; } - if (gc_try_setmark(globalref, &binding->nptr, &tag, &bits)) { - begin += 2; - binding->begin = begin; - gc_repush_markdata(&sp, gc_mark_binding_t); - new_obj = globalref; - goto mark; + if (new_obj != NULL) { + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(s, new_obj); } } - jl_module_t *m = binding->parent; - int scanparent = gc_try_setmark((jl_value_t*)m->parent, &binding->nptr, &tag, &bits); - size_t nusings = m->usings.len; - if (nusings) { - // this is only necessary because bindings for "using" modules - // are added only when accessed. therefore if a module is replaced - // after "using" it but before accessing it, this array might - // contain the only reference. 
- objary_begin = (jl_value_t**)m->usings.items; - objary_end = objary_begin + nusings; - gc_mark_objarray_t data = {(jl_value_t*)m, objary_begin, objary_end, 1, binding->nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &data, sizeof(data), 0); - if (!scanparent) { - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + jl_gcframe_t *sprev = (jl_gcframe_t *)gc_read_stack(&s->prev, offset, lb, ub); + if (sprev == NULL) + break; + gc_heap_snapshot_record_frame_to_frame_edge(s, sprev); + s = sprev; + uintptr_t new_nroots = gc_read_stack(&s->nroots, offset, lb, ub); + assert(new_nroots <= UINT32_MAX); + nroots = (uint32_t)new_nroots; + nr = nroots >> 2; + } +} + +// Mark exception stack +STATIC_INLINE void gc_mark_excstack(jl_ptls_t ptls, jl_excstack_t *excstack, size_t itr) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_value_t *new_obj; + while (itr > 0) { + size_t bt_size = jl_excstack_bt_size(excstack, itr); + jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); + for (size_t bt_index = 0; bt_index < bt_size; + bt_index += jl_bt_entry_size(bt_data + bt_index)) { + jl_bt_element_t *bt_entry = bt_data + bt_index; + if (jl_bt_is_native(bt_entry)) + continue; + // Found an extended backtrace entry: iterate over any + // GC-managed values inside. + size_t njlvals = jl_bt_num_jlvals(bt_entry); + for (size_t jlval_index = 0; jlval_index < njlvals; jlval_index++) { + new_obj = jl_bt_entry_jlvalue(bt_entry, jlval_index); + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(bt_entry, new_obj); } - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(data)); - sp.pc++; + } + // The exception comes last - mark it + new_obj = jl_excstack_exception(excstack, itr); + itr = jl_excstack_next(excstack, itr); + gc_try_claim_and_push(mq, new_obj, NULL); + gc_heap_snapshot_record_frame_to_object_edge(excstack, new_obj); + } +} + +// Mark module binding +STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent, jl_binding_t **mb_begin, + jl_binding_t **mb_end, uintptr_t nptr, + uint8_t bits) JL_NOTSAFEPOINT +{ + jl_gc_markqueue_t *mq = &ptls->mark_queue; + for (; mb_begin < mb_end; mb_begin += 2) { + jl_binding_t *b = *mb_begin; + if (b == (jl_binding_t *)HT_NOTFOUND) + continue; + if (jl_object_in_image((jl_value_t*)b)) { + jl_taggedvalue_t *buf = jl_astaggedvalue(b); + gc_try_setmark_tag(buf, GC_OLD_MARKED); } else { - gc_mark_push_remset(ptls, (jl_value_t*)m, binding->nptr); + gc_setmark_buf_(ptls, b, bits, sizeof(jl_binding_t)); } - if (scanparent) { - new_obj = (jl_value_t*)m->parent; - goto mark; + void *vb = jl_astaggedvalue(b); + verify_parent1("module", binding->parent, &vb, "binding_buff"); + // Record the size used for the box for non-const bindings + gc_heap_snapshot_record_module_to_binding(mb_parent, b); + (void)vb; + jl_value_t *ty = jl_atomic_load_relaxed(&b->ty); + if (ty && ty != (jl_value_t*)jl_any_type) { + verify_parent2("module", binding->parent, + &b->ty, "binding(%s)", jl_symbol_name(b->name)); + gc_try_claim_and_push(mq, ty, &nptr); } - goto pop; + jl_value_t *value = jl_atomic_load_relaxed(&b->value); + if (value) { + verify_parent2("module", binding->parent, + &b->value, "binding(%s)", jl_symbol_name(b->name)); + gc_try_claim_and_push(mq, value, &nptr); + } + jl_value_t *globalref = jl_atomic_load_relaxed(&b->globalref); + gc_try_claim_and_push(mq, globalref, &nptr); + } + gc_try_claim_and_push(mq, (jl_value_t *)mb_parent->parent, &nptr); + size_t 
nusings = mb_parent->usings.len; + if (nusings > 0) { + // this is only necessary because bindings for "using" modules + // are added only when accessed. therefore if a module is replaced + // after "using" it but before accessing it, this array might + // contain the only reference. + jl_value_t *obj_parent = (jl_value_t *)mb_parent; + jl_value_t **objary_begin = (jl_value_t **)mb_parent->usings.items; + jl_value_t **objary_end = objary_begin + nusings; + gc_mark_objarray(ptls, obj_parent, objary_begin, objary_end, 1, nptr); + } + else { + gc_mark_push_remset(ptls, (jl_value_t *)mb_parent, nptr); } +} -finlist: { - // Scan a finalizer (or format compatible) list. see `gc_mark_finlist_t` - gc_mark_finlist_t *finlist = gc_pop_markdata(&sp, gc_mark_finlist_t); - jl_value_t **begin = finlist->begin; - jl_value_t **end = finlist->end; - for (; begin < end; begin++) { - new_obj = *begin; - if (__unlikely(!new_obj)) - continue; - if (gc_ptr_tag(new_obj, 1)) { - new_obj = (jl_value_t*)gc_ptr_clear_tag(new_obj, 1); - begin++; - assert(begin < end); - } - if (gc_ptr_tag(new_obj, 2)) - continue; - uintptr_t nptr = 0; - if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) - continue; - begin++; - // Found an object to mark - if (begin < end) { - // Haven't done with this one yet. Update the content and push it back - finlist->begin = begin; - gc_repush_markdata(&sp, gc_mark_finlist_t); - } - goto mark; +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) +{ + jl_value_t *new_obj; + // Decide whether need to chunk finlist + size_t nrefs = (fl_end - fl_begin); + if (nrefs > GC_CHUNK_BATCH_SIZE) { + jl_gc_chunk_t c = {GC_finlist_chunk, NULL, fl_begin + GC_CHUNK_BATCH_SIZE, fl_end, 0, 0, 0, 0}; + gc_chunkqueue_push(mq, &c); + fl_end = fl_begin + GC_CHUNK_BATCH_SIZE; + } + for (; fl_begin < fl_end; fl_begin++) { + new_obj = *fl_begin; + if (__unlikely(!new_obj)) + continue; + if (gc_ptr_tag(new_obj, 1)) { + new_obj = (jl_value_t *)gc_ptr_clear_tag(new_obj, 1); + fl_begin++; + assert(fl_begin < fl_end); } - goto pop; + if (gc_ptr_tag(new_obj, 2)) + continue; + gc_try_claim_and_push(mq, new_obj, NULL); } +} -mark: { - // Generic scanning entry point. - // Expects `new_obj`, `tag` and `bits` to be set correctly. -#ifdef JL_DEBUG_BUILD +// Mark finalizer list (or list of objects following same format) +void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) +{ + size_t len = list->len; + if (len <= start) + return; + jl_value_t **fl_begin = (jl_value_t **)list->items + start; + jl_value_t **fl_end = (jl_value_t **)list->items + len; + gc_mark_finlist_(mq, fl_begin, fl_end); +} + +JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +{ + int may_claim = gc_try_setmark_tag(jl_astaggedvalue(obj), GC_MARKED); + if (may_claim) + gc_ptr_queue_push(&ptls->mark_queue, obj); + return may_claim; +} + +JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, + jl_value_t **objs, size_t nobjs) +{ + uintptr_t nptr = (nobjs << 2) | (jl_astaggedvalue(parent)->bits.gc & 2); + gc_mark_objarray(ptls, parent, objs, objs + nobjs, 1, nptr); +} + +// Enqueue and mark all outgoing references from `new_obj` which have not been marked +// yet. 
`meta_updated` is mostly used to make sure we don't update metadata twice for +// objects which have been enqueued into the `remset` +FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_new_obj, + int meta_updated) +{ + jl_value_t *new_obj = (jl_value_t *)_new_obj; + mark_obj: { + #ifdef JL_DEBUG_BUILD if (new_obj == gc_findval) jl_raise_debugger(); -#endif + #endif jl_taggedvalue_t *o = jl_astaggedvalue(new_obj); - jl_datatype_t *vt = (jl_datatype_t*)tag; - int foreign_alloc = 0; + jl_datatype_t *vt = (jl_datatype_t *)(o->header & ~(uintptr_t)0xf); + uint8_t bits = (gc_old(o->header) && !mark_reset_age) ? GC_OLD_MARKED : GC_MARKED; int update_meta = __likely(!meta_updated && !gc_verifying); - if (update_meta && o->bits.in_image) { + int foreign_alloc = 0; + if (update_meta && jl_object_in_image(new_obj)) { foreign_alloc = 1; update_meta = 0; } - meta_updated = 0; // Symbols are always marked assert(vt != jl_symbol_type); if (vt == jl_simplevector_type) { size_t l = jl_svec_len(new_obj); jl_value_t **data = jl_svec_data(new_obj); - size_t dtsz = l * sizeof(void*) + sizeof(jl_svec_t); + size_t dtsz = l * sizeof(void *) + sizeof(jl_svec_t); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = data; + jl_value_t **objary_end = data + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = data; - objary_end = data + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (vt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)new_obj; + jl_array_t *a = (jl_array_t *)new_obj; jl_array_flags_t flags = a->flags; if (update_meta) { if (flags.pooled) @@ -2849,9 +2556,10 @@ mark: { else gc_setmark_big(ptls, o, bits); } - else if (foreign_alloc) + else if (foreign_alloc) { objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_array_t)); - if (flags.how ==0){ + } + if (flags.how == 0) { void *data_ptr = (char*)a + sizeof(jl_array_t) +jl_array_ndimwords(a->flags.ndims) * sizeof(size_t); gc_heap_snapshot_record_hidden_edge(new_obj, data_ptr, jl_array_nbytes(a), 2); } @@ -2879,102 +2587,81 @@ mark: { else if (flags.how == 3) { jl_value_t *owner = jl_array_data_owner(a); uintptr_t nptr = (1 << 2) | (bits & GC_OLD); + gc_try_claim_and_push(mq, owner, &nptr); gc_heap_snapshot_record_internal_array_edge(new_obj, owner); - int markowner = gc_try_setmark(owner, &nptr, &tag, &bits); gc_mark_push_remset(ptls, new_obj, nptr); - if (markowner) { - new_obj = owner; - goto mark; - } - goto pop; + return; } - if (a->data == NULL || jl_array_len(a) == 0) - goto pop; + if (!a->data || jl_array_len(a) == 0) + return; if (flags.ptrarray) { - if ((jl_datatype_t*)jl_tparam0(vt) == jl_symbol_type) - goto pop; + if ((jl_datatype_t *)jl_tparam0(vt) == jl_symbol_type) + return; size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l; + uint32_t step = 1; uintptr_t nptr = (l << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, 
1, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (flags.hasptr) { - jl_datatype_t *et = (jl_datatype_t*)jl_tparam0(vt); + jl_datatype_t *et = (jl_datatype_t *)jl_tparam0(vt); const jl_datatype_layout_t *layout = et->layout; unsigned npointers = layout->npointers; - unsigned elsize = a->elsize / sizeof(jl_value_t*); + unsigned elsize = a->elsize / sizeof(jl_value_t *); size_t l = jl_array_len(a); + jl_value_t *objary_parent = new_obj; + jl_value_t **objary_begin = (jl_value_t **)a->data; + jl_value_t **objary_end = objary_begin + l * elsize; + uint32_t step = elsize; uintptr_t nptr = ((l * npointers) << 2) | (bits & GC_OLD); - objary_begin = (jl_value_t**)a->data; - objary_end = objary_begin + l * elsize; if (npointers == 1) { // TODO: detect anytime time stride is uniform? objary_begin += layout->first_ptr; - gc_mark_objarray_t markdata = {new_obj, objary_begin, objary_end, elsize, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(objarray), - &markdata, sizeof(markdata), 0); - objary = (gc_mark_objarray_t*)sp.data; - goto objarray_loaded; + gc_mark_objarray(ptls, objary_parent, objary_begin, objary_end, step, nptr); } else if (layout->fielddesc_type == 0) { - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; - gc_mark_array8_t markdata = {objary_begin, objary_end, obj8_begin, {new_obj, obj8_begin, obj8_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array8), - &markdata, sizeof(markdata), 0); - ary8 = (gc_mark_array8_t*)sp.data; - goto array8_loaded; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; + gc_mark_array8(ptls, objary_parent, objary_begin, objary_end, obj8_begin, + obj8_end, nptr); } else if (layout->fielddesc_type == 1) { - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; - gc_mark_array16_t markdata = {objary_begin, objary_end, obj16_begin, {new_obj, obj16_begin, obj16_end, nptr}}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(array16), - &markdata, sizeof(markdata), 0); - ary16 = (gc_mark_array16_t*)sp.data; - goto array16_loaded; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; + gc_mark_array16(ptls, objary_parent, objary_begin, objary_end, obj16_begin, + obj16_end, nptr); } else { assert(0 && "unimplemented"); } } - goto pop; } else if (vt == jl_module_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_module_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_module_t)); - jl_module_t *m = (jl_module_t*)new_obj; - jl_binding_t **table = (jl_binding_t**)m->bindings.table; - size_t bsize = m->bindings.size; - uintptr_t nptr = ((bsize + m->usings.len + 1) << 2) | (bits & GC_OLD); - gc_mark_binding_t markdata = {m, table + 1, table + bsize, nptr, bits}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(module_binding), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto module_binding; + jl_module_t *mb_parent = (jl_module_t *)new_obj; + jl_binding_t **mb_begin = (jl_binding_t **)mb_parent->bindings.table + 1; + size_t bsize = mb_parent->bindings.size; + jl_binding_t **mb_end = (jl_binding_t 
**)mb_parent->bindings.table + bsize; + uintptr_t nptr = ((bsize + mb_parent->usings.len + 1) << 2) | (bits & GC_OLD); + gc_mark_module_binding(ptls, mb_parent, mb_begin, mb_end, nptr, bits); } else if (vt == jl_task_type) { if (update_meta) gc_setmark(ptls, o, bits, sizeof(jl_task_t)); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, sizeof(jl_task_t)); - jl_task_t *ta = (jl_task_t*)new_obj; + jl_task_t *ta = (jl_task_t *)new_obj; gc_scrub_record_task(ta); if (gc_cblist_task_scanner) { - export_gc_state(ptls, &sp); int16_t tid = jl_atomic_load_relaxed(&ta->tid); - gc_invoke_callbacks(jl_gc_cb_task_scanner_t, - gc_cblist_task_scanner, - (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); - import_gc_state(ptls, &sp); + gc_invoke_callbacks(jl_gc_cb_task_scanner_t, gc_cblist_task_scanner, + (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); } -#ifdef COPY_STACKS + #ifdef COPY_STACKS void *stkbuf = ta->stkbuf; if (stkbuf && ta->copy_stack) { gc_setmark_buf_(ptls, stkbuf, bits, ta->bufsz); @@ -2983,14 +2670,14 @@ mark: { // TODO: edge to stack data // TODO: synthetic node for stack data (how big is it?) } -#endif + #endif jl_gcframe_t *s = ta->gcstack; size_t nroots; uintptr_t offset = 0; uintptr_t lb = 0; uintptr_t ub = (uintptr_t)-1; -#ifdef COPY_STACKS - if (stkbuf && ta->copy_stack && ta->ptls == NULL) { + #ifdef COPY_STACKS + if (stkbuf && ta->copy_stack && !ta->ptls) { int16_t tid = jl_atomic_load_relaxed(&ta->tid); assert(tid >= 0); jl_ptls_t ptls2 = gc_all_tls_states[tid]; @@ -2998,38 +2685,38 @@ mark: { lb = ub - ta->copy_stack; offset = (uintptr_t)stkbuf - lb; } -#endif - if (s) { + #endif + if (s != NULL) { nroots = gc_read_stack(&s->nroots, offset, lb, ub); gc_heap_snapshot_record_task_to_frame_edge(ta, s); - assert(nroots <= UINT32_MAX); - gc_mark_stackframe_t stackdata = {s, 0, (uint32_t)nroots, offset, lb, ub}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(stack), - &stackdata, sizeof(stackdata), 1); + gc_mark_stack(ptls, s, (uint32_t)nroots, offset, lb, ub); } if (ta->excstack) { - gc_heap_snapshot_record_task_to_frame_edge(ta, ta->excstack); - gc_setmark_buf_(ptls, ta->excstack, bits, sizeof(jl_excstack_t) + - sizeof(uintptr_t)*ta->excstack->reserved_size); - gc_mark_excstack_t stackdata = {ta->excstack, ta->excstack->top, 0, 0}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(excstack), - &stackdata, sizeof(stackdata), 1); + jl_excstack_t *excstack = ta->excstack; + gc_heap_snapshot_record_task_to_frame_edge(ta, excstack); + size_t itr = ta->excstack->top; + gc_setmark_buf_(ptls, excstack, bits, + sizeof(jl_excstack_t) + + sizeof(uintptr_t) * excstack->reserved_size); + gc_mark_excstack(ptls, excstack, itr); } const jl_datatype_layout_t *layout = jl_task_type->layout; assert(layout->fielddesc_type == 0); assert(layout->nfields > 0); uint32_t npointers = layout->npointers; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)ta; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; // assume tasks always reference young objects: set lowest bit uintptr_t nptr = (npointers << 2) | 1 | bits; - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - obj8_parent = (char*)ta; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, 
nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (vt == jl_string_type) { size_t dtsz = jl_string_len(new_obj) + sizeof(size_t) + 1; @@ -3037,140 +2724,388 @@ mark: { gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); - goto pop; } else { if (__unlikely(!jl_is_datatype(vt))) - gc_assert_datatype_fail(ptls, vt, sp); + gc_assert_datatype_fail(ptls, vt, mq); size_t dtsz = jl_datatype_size(vt); if (update_meta) gc_setmark(ptls, o, bits, dtsz); else if (foreign_alloc) objprofile_count(vt, bits == GC_OLD_MARKED, dtsz); if (vt == jl_weakref_type) - goto pop; + return; const jl_datatype_layout_t *layout = vt->layout; uint32_t npointers = layout->npointers; if (npointers == 0) - goto pop; - uintptr_t nptr = npointers << 2 | (bits & GC_OLD); - assert((layout->nfields > 0 || layout->fielddesc_type == 3) && "opaque types should have been handled specially"); + return; + uintptr_t nptr = (npointers << 2 | (bits & GC_OLD)); + assert((layout->nfields > 0 || layout->fielddesc_type == 3) && + "opaque types should have been handled specially"); if (layout->fielddesc_type == 0) { - obj8_parent = (char*)new_obj; - obj8_begin = (uint8_t*)jl_dt_layout_ptrs(layout); - obj8_end = obj8_begin + npointers; + char *obj8_parent = (char *)new_obj; + uint8_t *obj8_begin = (uint8_t *)jl_dt_layout_ptrs(layout); + uint8_t *obj8_end = obj8_begin + npointers; assert(obj8_begin < obj8_end); - gc_mark_obj8_t markdata = {new_obj, obj8_begin, obj8_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj8), - &markdata, sizeof(markdata), 0); - obj8 = (gc_mark_obj8_t*)sp.data; - goto obj8_loaded; + new_obj = gc_mark_obj8(ptls, obj8_parent, obj8_begin, obj8_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 1) { - obj16_parent = (char*)new_obj; - obj16_begin = (uint16_t*)jl_dt_layout_ptrs(layout); - obj16_end = obj16_begin + npointers; + char *obj16_parent = (char *)new_obj; + uint16_t *obj16_begin = (uint16_t *)jl_dt_layout_ptrs(layout); + uint16_t *obj16_end = obj16_begin + npointers; assert(obj16_begin < obj16_end); - gc_mark_obj16_t markdata = {new_obj, obj16_begin, obj16_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj16), - &markdata, sizeof(markdata), 0); - obj16 = (gc_mark_obj16_t*)sp.data; - goto obj16_loaded; + new_obj = gc_mark_obj16(ptls, obj16_parent, obj16_begin, obj16_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else if (layout->fielddesc_type == 2) { // This is very uncommon // Do not do store to load forwarding to save some code size - uint32_t *obj32_begin = (uint32_t*)jl_dt_layout_ptrs(layout); + char *obj32_parent = (char *)new_obj; + uint32_t *obj32_begin = (uint32_t *)jl_dt_layout_ptrs(layout); uint32_t *obj32_end = obj32_begin + npointers; - gc_mark_obj32_t markdata = {new_obj, obj32_begin, obj32_end, nptr}; - gc_mark_stack_push(&ptls->gc_cache, &sp, gc_mark_laddr(obj32), - &markdata, sizeof(markdata), 0); - sp.data = (jl_gc_mark_data_t *)(((char*)sp.data) + sizeof(markdata)); - goto obj32; + assert(obj32_begin < obj32_end); + new_obj = gc_mark_obj32(ptls, obj32_parent, obj32_begin, obj32_end, nptr); + if (new_obj != NULL) { + if (!meta_updated) + goto mark_obj; + else + gc_ptr_queue_push(mq, new_obj); + } } else { assert(layout->fielddesc_type == 3); - 
jl_fielddescdyn_t *desc = (jl_fielddescdyn_t*)jl_dt_layout_fields(layout);
+            jl_fielddescdyn_t *desc = (jl_fielddescdyn_t *)jl_dt_layout_fields(layout);
             int old = jl_astaggedvalue(new_obj)->bits.gc & 2;
-            export_gc_state(ptls, &sp);
             uintptr_t young = desc->markfunc(ptls, new_obj);
-            import_gc_state(ptls, &sp);
             if (old && young)
                 gc_mark_push_remset(ptls, new_obj, young * 4 + 3);
+            }
+        }
+    }
+}
+
+// Used in gc-debug
+void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq)
+{
+    while (1) {
+        void *new_obj = (void *)gc_ptr_queue_pop(&ptls->mark_queue);
+        // No more objects to mark
+        if (__unlikely(new_obj == NULL)) {
+            return;
+        }
+        gc_mark_outrefs(ptls, mq, new_obj, 0);
+    }
+}
+
+// Drain items from worker's own chunkqueue
+void gc_drain_own_chunkqueue(jl_ptls_t ptls, jl_gc_markqueue_t *mq)
+{
+    jl_gc_chunk_t c = {.cid = GC_empty_chunk};
+    do {
+        c = gc_chunkqueue_pop(mq);
+        if (c.cid != GC_empty_chunk) {
+            gc_mark_chunk(ptls, mq, &c);
+            gc_mark_loop_serial_(ptls, mq);
+        }
+    } while (c.cid != GC_empty_chunk);
+}
+
+// Main mark loop. A stack (allocated on the heap) of `jl_value_t *`
+// is used to keep track of processed items. Maintaining this stack (instead of
+// the native one) avoids stack overflow when marking deep objects and
+// makes it easier to implement parallel marking via work-stealing
+JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls)
+{
+    gc_mark_loop_serial_(ptls, &ptls->mark_queue);
+    gc_drain_own_chunkqueue(ptls, &ptls->mark_queue);
+}
+
+void gc_mark_and_steal(jl_ptls_t ptls)
+{
+    jl_gc_markqueue_t *mq = &ptls->mark_queue;
+    jl_gc_markqueue_t *mq_master = NULL;
+    int master_tid = jl_atomic_load(&gc_master_tid);
+    if (master_tid != -1)
+        mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    void *new_obj;
+    jl_gc_chunk_t c;
+    pop : {
+        new_obj = gc_ptr_queue_pop(mq);
+        if (new_obj != NULL) {
+            goto mark;
+        }
+        c = gc_chunkqueue_pop(mq);
+        if (c.cid != GC_empty_chunk) {
+            gc_mark_chunk(ptls, mq, &c);
+            goto pop;
+        }
+        goto steal;
+    }
+    mark : {
+        gc_mark_outrefs(ptls, mq, new_obj, 0);
+        goto pop;
+    }
+    // Note that for the stealing heuristics, we try to
+    // steal chunks much more aggressively than pointers,
+    // since we know chunks will likely expand into a lot
+    // of work for the mark loop
+    steal : {
+        // Try to steal chunk from random GC thread
+        for (int i = 0; i < 4 * jl_n_gcthreads; i++) {
+            uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads;
+            jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue;
+            c = gc_chunkqueue_steal_from(mq2);
+            if (c.cid != GC_empty_chunk) {
+                gc_mark_chunk(ptls, mq, &c);
+                goto pop;
+            }
+        }
+        // Sequentially walk GC threads to try to steal chunk
+        for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) {
+            jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue;
+            c = gc_chunkqueue_steal_from(mq2);
+            if (c.cid != GC_empty_chunk) {
+                gc_mark_chunk(ptls, mq, &c);
+                goto pop;
+            }
+        }
+        // Try to steal chunk from master thread
+        if (mq_master != NULL) {
+            c = gc_chunkqueue_steal_from(mq_master);
+            if (c.cid != GC_empty_chunk) {
+                gc_mark_chunk(ptls, mq, &c);
+                goto pop;
+            }
+        }
+        // Try to steal pointer from random GC thread
+        for (int i = 0; i < 4 * jl_n_gcthreads; i++) {
+            uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads;
+            jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue;
+            new_obj = gc_ptr_queue_steal_from(mq2);
+            if (new_obj != NULL)
+                goto mark;
+        }
+        // Sequentially walk GC threads to try to steal pointer
+        for (int
i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) { + jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue; + new_obj = gc_ptr_queue_steal_from(mq2); + if (new_obj != NULL) + goto mark; + } + // Try to steal pointer from master thread + if (mq_master != NULL) { + new_obj = gc_ptr_queue_steal_from(mq_master); + if (new_obj != NULL) + goto mark; + } + } +} + +#define GC_BACKOFF_MIN 4 +#define GC_BACKOFF_MAX 12 + +void gc_mark_backoff(int *i) +{ + if (*i < GC_BACKOFF_MAX) { + (*i)++; + } + for (int j = 0; j < (1 << *i); j++) { + jl_cpu_pause(); + } +} + +void gc_mark_loop_parallel(jl_ptls_t ptls, int master) +{ + int backoff = GC_BACKOFF_MIN; + if (master) { + jl_atomic_store(&gc_master_tid, ptls->tid); + // Wake threads up and try to do some work + uv_mutex_lock(&gc_threads_lock); + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); + gc_mark_and_steal(ptls); + jl_atomic_fetch_add(&gc_n_threads_marking, -1); + } + while (jl_atomic_load(&gc_n_threads_marking) > 0) { + // Try to become a thief while other threads are marking + jl_atomic_fetch_add(&gc_n_threads_marking, 1); + if (jl_atomic_load(&gc_master_tid) != -1) { + gc_mark_and_steal(ptls); + } + jl_atomic_fetch_add(&gc_n_threads_marking, -1); + // Failed to steal + gc_mark_backoff(&backoff); + } +} + +void gc_mark_loop(jl_ptls_t ptls) +{ + if (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled) { + gc_mark_loop_serial(ptls); + } + else { + gc_mark_loop_parallel(ptls, 1); + } +} + +void gc_mark_loop_barrier(void) +{ + jl_atomic_store(&gc_master_tid, -1); + while (jl_atomic_load(&gc_n_threads_marking) != 0) { + jl_cpu_pause(); + } +} + +void gc_mark_clean_reclaim_sets(void) +{ + // Clean up `reclaim-sets` + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + arraylist_t *reclaim_set2 = &ptls2->mark_queue.reclaim_set; + ws_array_t *a = NULL; + while ((a = (ws_array_t *)arraylist_pop(reclaim_set2)) != NULL) { + free(a->buffer); + free(a); + } + } +} + +static void gc_premark(jl_ptls_t ptls2) +{ + arraylist_t *remset = ptls2->heap.remset; + ptls2->heap.remset = ptls2->heap.last_remset; + ptls2->heap.last_remset = remset; + ptls2->heap.remset->len = 0; + ptls2->heap.remset_nptr = 0; + // avoid counting remembered objects & bindings twice + // in `perm_scanned_bytes` + size_t len = remset->len; + void **items = remset->items; + for (size_t i = 0; i < len; i++) { + jl_value_t *item = (jl_value_t*)items[i]; + objprofile_count(jl_typeof(item), 2, 0); + jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; + } + len = ptls2->heap.rem_bindings.len; + items = ptls2->heap.rem_bindings.items; + for (size_t i = 0; i < len; i++) { + void *ptr = items[i]; + jl_astaggedvalue(ptr)->bits.gc = GC_OLD_MARKED; } } -static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - jl_ptls_t ptls2) +static void gc_queue_thread_local(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) { jl_task_t *task; task = ptls2->root_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "root task"); } task = jl_atomic_load_relaxed(&ptls2->current_task); - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "current task"); } task = ptls2->next_task; - if (task) { - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + 
gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "next task"); } task = ptls2->previous_task; - if (task) { // shouldn't be necessary, but no reason not to - gc_mark_queue_obj(gc_cache, sp, task); + if (task != NULL) { + gc_try_claim_and_push(mq, task, NULL); gc_heap_snapshot_record_root((jl_value_t*)task, "previous task"); } if (ptls2->previous_exception) { - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); + gc_try_claim_and_push(mq, ptls2->previous_exception, NULL); gc_heap_snapshot_record_root((jl_value_t*)ptls2->previous_exception, "previous exception"); } } +static void gc_queue_bt_buf(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) +{ + jl_bt_element_t *bt_data = ptls2->bt_data; + size_t bt_size = ptls2->bt_size; + for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { + jl_bt_element_t *bt_entry = bt_data + i; + if (jl_bt_is_native(bt_entry)) + continue; + size_t njlvals = jl_bt_num_jlvals(bt_entry); + for (size_t j = 0; j < njlvals; j++) + gc_try_claim_and_push(mq, jl_bt_entry_jlvalue(bt_entry, j), NULL); + } +} + +static void gc_queue_remset(jl_ptls_t ptls, jl_ptls_t ptls2) +{ + size_t len = ptls2->heap.last_remset->len; + void **items = ptls2->heap.last_remset->items; + for (size_t i = 0; i < len; i++) { + // Objects in the `remset` are already marked, + // so a `gc_try_claim_and_push` wouldn't work here + gc_mark_outrefs(ptls, &ptls->mark_queue, (jl_value_t *)items[i], 1); + } + int n_bnd_refyoung = 0; + len = ptls2->heap.rem_bindings.len; + items = ptls2->heap.rem_bindings.items; + for (size_t i = 0; i < len; i++) { + jl_binding_t *ptr = (jl_binding_t*)items[i]; + uintptr_t bnd_refyoung = 0; + jl_value_t *v = jl_atomic_load_relaxed(&ptr->value); + gc_try_claim_and_push(&ptls->mark_queue, v, &bnd_refyoung); + jl_value_t *ty = jl_atomic_load_relaxed(&ptr->ty); + gc_try_claim_and_push(&ptls->mark_queue, ty, &bnd_refyoung); + jl_value_t *globalref = jl_atomic_load_relaxed(&ptr->globalref); + gc_try_claim_and_push(&ptls->mark_queue, globalref, &bnd_refyoung); + if (bnd_refyoung) { + items[n_bnd_refyoung] = ptr; + n_bnd_refyoung++; + } + } + ptls2->heap.rem_bindings.len = n_bnd_refyoung; +} + extern jl_value_t *cmpswap_names JL_GLOBALLY_ROOTED; // mark the initial root set -static void mark_roots(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) +static void gc_mark_roots(jl_gc_markqueue_t *mq) { // modules - gc_mark_queue_obj(gc_cache, sp, jl_main_module); + gc_try_claim_and_push(mq, jl_main_module, NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_main_module, "main_module"); - // invisible builtin values - if (jl_an_empty_vec_any != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_an_empty_vec_any); - if (jl_module_init_order != NULL) - gc_mark_queue_obj(gc_cache, sp, jl_module_init_order); + gc_try_claim_and_push(mq, jl_an_empty_vec_any, NULL); + gc_try_claim_and_push(mq, jl_module_init_order, NULL); for (size_t i = 0; i < jl_current_modules.size; i += 2) { if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { - gc_mark_queue_obj(gc_cache, sp, jl_current_modules.table[i]); + gc_try_claim_and_push(mq, jl_current_modules.table[i], NULL); gc_heap_snapshot_record_root((jl_value_t*)jl_current_modules.table[i], "top level module"); } } - gc_mark_queue_obj(gc_cache, sp, jl_anytuple_type_type); + gc_try_claim_and_push(mq, jl_anytuple_type_type, NULL); for (size_t i = 0; i < N_CALL_CACHE; i++) { jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); - if (v != NULL) { - gc_mark_queue_obj(gc_cache, sp, v); - } + 
gc_try_claim_and_push(mq, v, NULL); } - if (jl_all_methods != NULL) { - gc_mark_queue_obj(gc_cache, sp, jl_all_methods); - } - if (_jl_debug_method_invalidation != NULL) - gc_mark_queue_obj(gc_cache, sp, _jl_debug_method_invalidation); - + gc_try_claim_and_push(mq, jl_all_methods, NULL); + gc_try_claim_and_push(mq, _jl_debug_method_invalidation, NULL); // constants - gc_mark_queue_obj(gc_cache, sp, jl_emptytuple_type); - if (cmpswap_names != NULL) - gc_mark_queue_obj(gc_cache, sp, cmpswap_names); - gc_mark_queue_obj(gc_cache, sp, jl_global_roots_table); + gc_try_claim_and_push(mq, jl_emptytuple_type, NULL); + gc_try_claim_and_push(mq, cmpswap_names, NULL); + gc_try_claim_and_push(mq, jl_global_roots_table, NULL); } // find unmarked objects that need to be finalized from the finalizer list "list". @@ -3310,76 +3245,6 @@ JL_DLLEXPORT int64_t jl_gc_live_bytes(void) return live_bytes; } -static void jl_gc_premark(jl_ptls_t ptls2) -{ - arraylist_t *remset = ptls2->heap.remset; - ptls2->heap.remset = ptls2->heap.last_remset; - ptls2->heap.last_remset = remset; - ptls2->heap.remset->len = 0; - ptls2->heap.remset_nptr = 0; - - // avoid counting remembered objects & bindings twice - // in `perm_scanned_bytes` - size_t len = remset->len; - void **items = remset->items; - for (size_t i = 0; i < len; i++) { - jl_value_t *item = (jl_value_t*)items[i]; - objprofile_count(jl_typeof(item), 2, 0); - jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; - } - len = ptls2->heap.rem_bindings.len; - items = ptls2->heap.rem_bindings.items; - for (size_t i = 0; i < len; i++) { - void *ptr = items[i]; - jl_astaggedvalue(ptr)->bits.gc = GC_OLD_MARKED; - } -} - -static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) -{ - size_t len = ptls2->heap.last_remset->len; - void **items = ptls2->heap.last_remset->items; - for (size_t i = 0; i < len; i++) - gc_mark_queue_scan_obj(gc_cache, sp, (jl_value_t*)items[i]); - int n_bnd_refyoung = 0; - len = ptls2->heap.rem_bindings.len; - items = ptls2->heap.rem_bindings.items; - for (size_t i = 0; i < len; i++) { - jl_binding_t *ptr = (jl_binding_t*)items[i]; - // A null pointer can happen here when the binding is cleaned up - // as an exception is thrown after it was already queued (#10221) - int bnd_refyoung = 0; - jl_value_t *v = jl_atomic_load_relaxed(&ptr->value); - if (v != NULL && gc_mark_queue_obj(gc_cache, sp, v)) - bnd_refyoung = 1; - jl_value_t *ty = jl_atomic_load_relaxed(&ptr->ty); - if (ty != NULL && gc_mark_queue_obj(gc_cache, sp, ty)) - bnd_refyoung = 1; - jl_value_t *globalref = jl_atomic_load_relaxed(&ptr->globalref); - if (globalref != NULL && gc_mark_queue_obj(gc_cache, sp, globalref)) - bnd_refyoung = 1; - if (bnd_refyoung) { - items[n_bnd_refyoung] = ptr; - n_bnd_refyoung++; - } - } - ptls2->heap.rem_bindings.len = n_bnd_refyoung; -} - -static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) -{ - jl_bt_element_t *bt_data = ptls2->bt_data; - size_t bt_size = ptls2->bt_size; - for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { - jl_bt_element_t *bt_entry = bt_data + i; - if (jl_bt_is_native(bt_entry)) - continue; - size_t njlvals = jl_bt_num_jlvals(bt_entry); - for (size_t j = 0; j < njlvals; j++) - gc_mark_queue_obj(gc_cache, sp, jl_bt_entry_jlvalue(bt_entry, j)); - } -} - size_t jl_maxrss(void); // Only one thread should be running in this function @@ -3387,9 +3252,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) { 
combine_thread_gc_counts(&gc_num); - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - jl_gc_mark_sp_t sp; - gc_mark_sp_init(gc_cache, &sp); + jl_gc_markqueue_t *mq = &ptls->mark_queue; uint64_t gc_start_time = jl_hrtime(); int64_t last_perm_scanned_bytes = perm_scanned_bytes; @@ -3401,33 +3264,39 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) - jl_gc_premark(ptls2); + gc_premark(ptls2); } assert(gc_n_threads); + int single_threaded = (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - if (ptls2 == NULL) - continue; - // 2.1. mark every object in the `last_remsets` and `rem_binding` - jl_gc_queue_remset(gc_cache, &sp, ptls2); - // 2.2. mark every thread local root - jl_gc_queue_thread_local(gc_cache, &sp, ptls2); - // 2.3. mark any managed objects in the backtrace buffer - // TODO: treat these as roots for gc_heap_snapshot_record - jl_gc_queue_bt_buf(gc_cache, &sp, ptls2); + jl_gc_markqueue_t *mq2 = mq; + jl_ptls_t ptls_gc_thread = NULL; + if (!single_threaded) { + ptls_gc_thread = gc_all_tls_states[gc_first_tid + t_i % jl_n_gcthreads]; + mq2 = &ptls_gc_thread->mark_queue; + } + if (ptls2 != NULL) { + // 2.1. mark every thread local root + gc_queue_thread_local(mq2, ptls2); + // 2.2. mark any managed objects in the backtrace buffer + // TODO: treat these as roots for gc_heap_snapshot_record + gc_queue_bt_buf(mq2, ptls2); + // 2.3. mark every object in the `last_remsets` and `rem_binding` + gc_queue_remset(single_threaded ? ptls : ptls_gc_thread, ptls2); + } } // 3. walk roots - mark_roots(gc_cache, &sp); + gc_mark_roots(mq); if (gc_cblist_root_scanner) { - export_gc_state(ptls, &sp); gc_invoke_callbacks(jl_gc_cb_root_scanner_t, gc_cblist_root_scanner, (collection)); - import_gc_state(ptls, &sp); } - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop(ptls); + gc_mark_loop_barrier(); + gc_mark_clean_reclaim_sets(); gc_num.since_sweep += gc_num.allocd; JL_PROBE_GC_MARK_END(scanned_bytes, perm_scanned_bytes); gc_settime_premark_end(); @@ -3448,9 +3317,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - sweep_finalizer_list(&ptls2->finalizers); + if (ptls2 != NULL) + sweep_finalizer_list(&ptls2->finalizers); } if (prev_sweep_full) { sweep_finalizer_list(&finalizer_list_marked); @@ -3459,15 +3327,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); + if (ptls2 != NULL) + gc_mark_finlist(mq, &ptls2->finalizers, 0); } - gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len); + gc_mark_finlist(mq, &finalizer_list_marked, orig_marked_len); // "Flush" the mark stack before flipping the reset_age bit // so that the objects are not incorrectly reset. - gc_mark_loop(ptls, sp); - gc_mark_sp_init(gc_cache, &sp); + gc_mark_loop_serial(ptls); // Conservative marking relies on age to tell allocated objects // and freelist entries apart. 
mark_reset_age = !jl_gc_conservative_gc_support_enabled(); @@ -3475,9 +3341,10 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // `to_finalize` list. These objects are only reachable from this list // and should not be referenced by any old objects so this won't break // the GC invariant. - gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - gc_mark_loop(ptls, sp); + gc_mark_finlist(mq, &to_finalize, 0); + gc_mark_loop_serial(ptls); mark_reset_age = 0; + gc_settime_postmark_end(); // Flush everything in mark cache @@ -3502,9 +3369,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - nptr += ptls2->heap.remset_nptr; + if (ptls2 != NULL) + nptr += ptls2->heap.remset_nptr; } // many pointers in the intergen frontier => "quick" mark is not quick @@ -3525,7 +3391,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) size_t maxmem = 0; #ifdef _P64 // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2 - maxmem = total_mem / gc_n_threads / 2; + maxmem = total_mem / (gc_n_threads - jl_n_gcthreads) / 2; #endif if (maxmem < max_collect_interval) maxmem = max_collect_interval; @@ -3557,8 +3423,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) last_long_collect_interval = gc_num.interval; } scanned_bytes = 0; - // 5. start sweeping pool_live_bytes = 0; + // 6. start sweeping uint64_t start_sweep_time = jl_hrtime(); JL_PROBE_GC_SWEEP_BEGIN(sweep_full); sweep_weak_refs(); @@ -3585,7 +3451,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } // sweeping is over - // 6. if it is a quick sweep, put back the remembered objects in queued state + // 7. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. 
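Worked example for the revised maxmem heuristic above (illustrative numbers): dedicated GC threads never allocate, so they are now excluded from the divisor. On a machine with total_mem = 64 GiB, gc_n_threads = 8 and jl_n_gcthreads = 4, the cap was previously 64 GiB / 8 / 2 = 4 GiB and is now 64 GiB / (8 - 4) / 2 = 8 GiB.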
assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { @@ -3619,7 +3485,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } #endif - _report_gc_finished(pause, gc_num.freed, sweep_full, recollect); gc_final_pause_end(gc_start_time, gc_end_time); @@ -3637,19 +3502,19 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (collection == JL_GC_AUTO) { //If we aren't freeing enough or are seeing lots and lots of pointers let it increase faster - if(!not_freed_enough || large_frontier) { + if (!not_freed_enough || large_frontier) { int64_t tot = 2 * (live_bytes + gc_num.since_sweep) / 3; if (gc_num.interval > tot) { gc_num.interval = tot; last_long_collect_interval = tot; } + } // If the current interval is larger than half the live data decrease the interval - } else { + else { int64_t half = (live_bytes / 2); if (gc_num.interval > half) gc_num.interval = half; } - // But never go below default if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval; } @@ -3771,16 +3636,15 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) errno = last_errno; } -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) { - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; assert(gc_n_threads); for (size_t i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2) - jl_gc_queue_thread_local(gc_cache, sp, ptls2); + if (ptls2 != NULL) + gc_queue_thread_local(mq, ptls2); } - mark_roots(gc_cache, sp); + gc_mark_roots(mq); } // allocator entry points @@ -3817,10 +3681,20 @@ void jl_init_thread_heap(jl_ptls_t ptls) gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; gc_cache->nbig_obj = 0; - size_t init_size = 1024; - gc_cache->pc_stack = (void**)malloc_s(init_size * sizeof(void*)); - gc_cache->pc_stack_end = gc_cache->pc_stack + init_size; - gc_cache->data_stack = (jl_gc_mark_data_t *)malloc_s(init_size * sizeof(jl_gc_mark_data_t)); + + // Initialize GC mark-queue + jl_gc_markqueue_t *mq = &ptls->mark_queue; + ws_queue_t *cq = &mq->chunk_queue; + ws_array_t *wsa = create_ws_array(GC_CHUNK_QUEUE_INIT_SIZE, sizeof(jl_gc_chunk_t)); + jl_atomic_store_relaxed(&cq->top, 0); + jl_atomic_store_relaxed(&cq->bottom, 0); + jl_atomic_store_relaxed(&cq->array, wsa); + ws_queue_t *q = &mq->ptr_queue; + ws_array_t *wsa2 = create_ws_array(GC_PTR_QUEUE_INIT_SIZE, sizeof(jl_value_t *)); + jl_atomic_store_relaxed(&q->top, 0); + jl_atomic_store_relaxed(&q->bottom, 0); + jl_atomic_store_relaxed(&q->array, wsa2); + arraylist_new(&mq->reclaim_set, 32); memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); @@ -3833,6 +3707,8 @@ void jl_gc_init(void) JL_MUTEX_INIT(&finalizers_lock); uv_mutex_init(&gc_cache_lock); uv_mutex_init(&gc_perm_lock); + uv_mutex_init(&gc_threads_lock); + uv_cond_init(&gc_threads_cond); jl_gc_init_page(); jl_gc_debug_init(); @@ -3864,9 +3740,6 @@ void jl_gc_init(void) #endif if (jl_options.heap_size_hint) jl_gc_set_max_memory(jl_options.heap_size_hint); - - jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL}; - gc_mark_loop(NULL, sp); t_start = jl_hrtime(); } @@ -3894,7 +3767,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); 
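Worked example for the auto-collection interval adjustment above (illustrative numbers): with live_bytes = 3 GiB and gc_num.since_sweep = 3 GiB, the first branch caps gc_num.interval at 2 * (3 + 3) GiB / 3 = 4 GiB, while the second branch caps it at live_bytes / 2 = 1.5 GiB; in either case the interval is floored at default_collect_interval.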
jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3909,7 +3782,7 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -3925,7 +3798,7 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; free(p); - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; jl_atomic_store_relaxed(&ptls->gc_num.freed, jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); @@ -3938,7 +3811,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { + if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (sz < old) diff --git a/src/gc.h b/src/gc.h index cf6580bd69df3..adc89e4769dfc 100644 --- a/src/gc.h +++ b/src/gc.h @@ -42,7 +42,6 @@ extern "C" { typedef struct { uint64_t num; uint64_t next; - uint64_t min; uint64_t interv; uint64_t max; @@ -86,163 +85,33 @@ typedef struct { uint64_t last_incremental_sweep; } jl_gc_num_t; -enum { - GC_MARK_L_marked_obj, - GC_MARK_L_scan_only, - GC_MARK_L_finlist, - GC_MARK_L_objarray, - GC_MARK_L_array8, - GC_MARK_L_array16, - GC_MARK_L_obj8, - GC_MARK_L_obj16, - GC_MARK_L_obj32, - GC_MARK_L_stack, - GC_MARK_L_excstack, - GC_MARK_L_module_binding, - _GC_MARK_L_MAX -}; - -// The following structs (`gc_mark_*_t`) contain iterator state used for the -// scanning of various object types. -// -// The `nptr` member records the number of pointers slots referenced by -// an object to be used in the full collection heuristics as well as whether the object -// references young objects. -// `nptr >> 2` is the number of pointers fields referenced by the object. -// The lowest bit of `nptr` is set if the object references young object. -// The 2nd lowest bit of `nptr` is the GC old bits of the object after marking. -// A `0x3` in the low bits means that the object needs to be in the remset. - -// An generic object that's marked and needs to be scanned -// The metadata might need update too (depend on the PC) -typedef struct { - jl_value_t *obj; // The object - uintptr_t tag; // The tag with the GC bits masked out - uint8_t bits; // The GC bits after tagging (`bits & 1 == 1`) -} gc_mark_marked_obj_t; - -// An object array. This can come from an array, svec, or the using array or a module -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint32_t step; // Number of pointers to jump between marks - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_objarray_t; - -// A normal object with 8bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint8_t *begin; // Current field descriptor. - uint8_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj8_t; - -// A normal object with 16bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint16_t *begin; // Current field descriptor. 
- uint16_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj16_t; - -// A normal object with 32bits field descriptors -typedef struct { - jl_value_t *parent; // The parent object to trigger write barrier on. - uint32_t *begin; // Current field descriptor. - uint32_t *end; // End of field descriptor. - uintptr_t nptr; // See notes about `nptr` above. -} gc_mark_obj32_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint8_t *rebegin; - gc_mark_obj8_t elem; -} gc_mark_array8_t; - -typedef struct { - jl_value_t **begin; // The first slot to be scanned. - jl_value_t **end; // The end address (after the last slot to be scanned) - uint16_t *rebegin; - gc_mark_obj16_t elem; -} gc_mark_array16_t; - -// Stack frame -typedef struct { - jl_gcframe_t *s; // The current stack frame - uint32_t i; // The current slot index in the frame - uint32_t nroots; // `nroots` fields in the frame - // Parameters to mark the copy_stack range. - uintptr_t offset; - uintptr_t lb; - uintptr_t ub; -} gc_mark_stackframe_t; - -// Exception stack data -typedef struct { - jl_excstack_t *s; // Stack of exceptions - size_t itr; // Iterator into exception stack - size_t bt_index; // Current backtrace buffer entry index - size_t jlval_index; // Index into GC managed values for current bt entry -} gc_mark_excstack_t; - -// Module bindings. This is also the beginning of module scanning. -// The loop will start marking other references in a module after the bindings are marked -typedef struct { - jl_module_t *parent; // The parent module to trigger write barrier on. - jl_binding_t **begin; // The first slot to be scanned. - jl_binding_t **end; // The end address (after the last slot to be scanned) - uintptr_t nptr; // See notes about `nptr` above. - uint8_t bits; // GC bits of the module (the bits to mark the binding buffer with) -} gc_mark_binding_t; - -// Finalizer (or object) list -typedef struct { - jl_value_t **begin; - jl_value_t **end; -} gc_mark_finlist_t; - -// This is used to determine the max size of the data objects on the data stack. -// We'll use this size to determine the size of the data stack corresponding to a -// PC stack size. Since the data objects are not all of the same size, we'll waste -// some memory on the data stack this way but that size is unlikely going to be significant. -union _jl_gc_mark_data { - gc_mark_marked_obj_t marked; - gc_mark_objarray_t objarray; - gc_mark_array8_t array8; - gc_mark_array16_t array16; - gc_mark_obj8_t obj8; - gc_mark_obj16_t obj16; - gc_mark_obj32_t obj32; - gc_mark_stackframe_t stackframe; - gc_mark_excstack_t excstackframe; - gc_mark_binding_t binding; - gc_mark_finlist_t finlist; -}; - -// Pop a data struct from the mark data stack (i.e. decrease the stack pointer) -// This should be used after dispatch and therefore the pc stack pointer is already popped from -// the stack. -STATIC_INLINE void *gc_pop_markdata_(jl_gc_mark_sp_t *sp, size_t size) -{ - jl_gc_mark_data_t *data = (jl_gc_mark_data_t *)(((char*)sp->data) - size); - sp->data = data; - return data; -} -#define gc_pop_markdata(sp, type) ((type*)gc_pop_markdata_(sp, sizeof(type))) - -// Re-push a frame to the mark stack (both data and pc) -// The data and pc are expected to be on the stack (or updated in place) already. -// Mainly useful to pause the current scanning in order to scan an new object. 
-STATIC_INLINE void *gc_repush_markdata_(jl_gc_mark_sp_t *sp, size_t size) JL_NOTSAFEPOINT -{ - jl_gc_mark_data_t *data = sp->data; - sp->pc++; - sp->data = (jl_gc_mark_data_t *)(((char*)sp->data) + size); - return data; -} -#define gc_repush_markdata(sp, type) ((type*)gc_repush_markdata_(sp, sizeof(type))) +// Array chunks (work items representing suffixes of +// large arrays of pointers left to be marked) + +typedef enum { + GC_empty_chunk = 0, // for sentinel representing no items left in chunk queue + GC_objary_chunk, // for chunk of object array + GC_ary8_chunk, // for chunk of array with 8 bit field descriptors + GC_ary16_chunk, // for chunk of array with 16 bit field descriptors + GC_finlist_chunk, // for chunk of finalizer list +} gc_chunk_id_t; + +typedef struct _jl_gc_chunk_t { + gc_chunk_id_t cid; + struct _jl_value_t *parent; // array owner + struct _jl_value_t **begin; // pointer to first element that needs scanning + struct _jl_value_t **end; // pointer to last element that needs scanning + void *elem_begin; // used to scan pointers within objects when marking `ary8` or `ary16` + void *elem_end; // used to scan pointers within objects when marking `ary8` or `ary16` + uint32_t step; // step-size used when marking objarray + uintptr_t nptr; // (`nptr` & 0x1) if array has young element and (`nptr` & 0x2) if array owner is old +} jl_gc_chunk_t; + +#define GC_CHUNK_BATCH_SIZE (1 << 16) // maximum number of references that can be processed + // without creating a chunk + +#define GC_PTR_QUEUE_INIT_SIZE (1 << 18) // initial size of queue of `jl_value_t *` +#define GC_CHUNK_QUEUE_INIT_SIZE (1 << 14) // initial size of chunk-queue // layout for big (>2k) objects @@ -511,23 +380,16 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFE *list = hdr; } -STATIC_INLINE void gc_mark_sp_init(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp) -{ - sp->pc = gc_cache->pc_stack; - sp->data = gc_cache->data_stack; - sp->pc_start = gc_cache->pc_stack; - sp->pc_end = gc_cache->pc_stack_end; -} - -void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp); -void gc_mark_queue_finlist(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, - arraylist_t *list, size_t start); -void gc_mark_loop(jl_ptls_t ptls, jl_gc_mark_sp_t sp); +void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, + jl_value_t **fl_end) JL_NOTSAFEPOINT; +void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, + size_t start) JL_NOTSAFEPOINT; +void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq); +void gc_mark_loop_serial(jl_ptls_t ptls); void sweep_stack_pools(void); void jl_gc_debug_init(void); -extern void *gc_mark_label_addrs[_GC_MARK_L_MAX]; - // GC pages void jl_gc_init_page(void); @@ -612,7 +474,6 @@ static inline void gc_verify_tags(void) } #endif - #ifdef GC_VERIFY extern jl_value_t *lostval; void gc_verify(jl_ptls_t ptls); @@ -653,10 +514,9 @@ extern int gc_verifying; #define gc_verifying (0) #endif - int gc_slot_to_fieldidx(void *_obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT; int gc_slot_to_arrayidx(void *_obj, void *begin) JL_NOTSAFEPOINT; -NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset); +NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int offset) JL_NOTSAFEPOINT; #ifdef GC_DEBUG_ENV JL_DLLEXPORT extern jl_gc_debug_env_t jl_gc_debug_env; diff --git a/src/init.c b/src/init.c index 522f16041d566..0ab6d59e6ec6a 100644 --- 
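To make the chunk machinery above concrete, a hedged sketch of how a large object array might be split during marking: one batch of at most GC_CHUNK_BATCH_SIZE references is pushed eagerly, and the suffix is queued as a GC_objary_chunk that any marker can later pop or steal. gc_chunkqueue_push is an assumed helper name wrapping ws_queue_push; the real logic lives in src/gc.c.

    #include "gc.h" // jl_gc_chunk_t, GC_CHUNK_BATCH_SIZE, jl_gc_markqueue_t

    // Sketch: mark one batch of an object array now and defer the rest as a
    // chunk work item on this thread's chunk queue.
    static void mark_objarray_in_batches(jl_ptls_t ptls, jl_value_t *parent,
                                         jl_value_t **begin, jl_value_t **end,
                                         uint32_t step, uintptr_t nptr)
    {
        jl_gc_markqueue_t *mq = &ptls->mark_queue;
        if (end - begin > GC_CHUNK_BATCH_SIZE) {
            // defer everything past the first batch
            jl_gc_chunk_t c = {GC_objary_chunk, parent,
                               begin + GC_CHUNK_BATCH_SIZE, end,
                               NULL, NULL, step, nptr};
            gc_chunkqueue_push(mq, &c); // assumed helper
            end = begin + GC_CHUNK_BATCH_SIZE;
        }
        for (; begin < end; begin += step)
            gc_try_claim_and_push(mq, *begin, &nptr);
    }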
a/src/init.c +++ b/src/init.c @@ -840,6 +840,7 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ if (jl_base_module == NULL) { // nthreads > 1 requires code in Base jl_atomic_store_relaxed(&jl_n_threads, 1); + jl_n_gcthreads = 0; } jl_start_threads(); diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc index fee57ed60dd7a..f28ecefbded4a 100644 --- a/src/jl_exported_data.inc +++ b/src/jl_exported_data.inc @@ -132,6 +132,7 @@ #define JL_EXPORTED_DATA_SYMBOLS(XX) \ XX(jl_n_threadpools, int) \ XX(jl_n_threads, _Atomic(int)) \ + XX(jl_n_gcthreads, int) \ XX(jl_options, jl_options_t) \ XX(jl_task_gcstack_offset, int) \ XX(jl_task_ptls_offset, int) \ diff --git a/src/jloptions.c b/src/jloptions.c index 988078c7c58d9..f325452e19e41 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -40,6 +40,7 @@ JL_DLLEXPORT void jl_init_options(void) NULL, // cpu_target ("native", "core2", etc...) 0, // nthreadpools 0, // nthreads + 0, // ngcthreads NULL, // nthreads_per_pool 0, // nprocs NULL, // machine_file @@ -129,6 +130,7 @@ static const char opts[] = " interface if supported (Linux and Windows) or to the number of CPU\n" " threads if not supported (MacOS) or if process affinity is not\n" " configured, and sets M to 1.\n" + " --gcthreads=N Use N threads for GC, set to half of the number of compute threads if unspecified.\n" " -p, --procs {N|auto} Integer value N launches N additional local worker processes\n" " \"auto\" launches as many workers as the number of local CPU threads (logical cores)\n" " --machine-file <file> Run processes on hosts listed in <file>\n\n" @@ -253,7 +255,8 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) opt_strip_metadata, opt_strip_ir, opt_heap_size_hint, - opt_permalloc_pkgimg + opt_permalloc_pkgimg, + opt_gc_threads, }; static const char* const shortopts = "+vhqH:e:E:L:J:C:it:p:O:g:"; static const struct option longopts[] = { @@ -278,6 +281,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) { "cpu-target", required_argument, 0, 'C' }, { "procs", required_argument, 0, 'p' }, { "threads", required_argument, 0, 't' }, + { "gcthreads", required_argument, 0, opt_gc_threads }, { "machine-file", required_argument, 0, opt_machine_file }, { "project", optional_argument, 0, opt_project }, { "color", required_argument, 0, opt_color }, @@ -827,6 +831,13 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) else jl_errorf("julia: invalid argument to --permalloc-pkgimg={yes|no} (%s)", optarg); break; + case opt_gc_threads: + errno = 0; + long ngcthreads = strtol(optarg, &endptr, 10); + if (errno != 0 || optarg == endptr || *endptr != 0 || ngcthreads < 1 || ngcthreads >= INT16_MAX) + jl_errorf("julia: --gcthreads=<n>; n must be an integer >= 1"); + jl_options.ngcthreads = (int16_t)ngcthreads; + break; default: jl_errorf("julia: unhandled option -- %c\n" "This is a bug, please report it.", c); diff --git a/src/jloptions.h b/src/jloptions.h index 0b72b4c3be062..93f6d321f38d6 100644 --- a/src/jloptions.h +++ b/src/jloptions.h @@ -15,6 +15,7 @@ typedef struct { const char *cpu_target; int8_t nthreadpools; int16_t nthreads; + int16_t ngcthreads; const int16_t *nthreads_per_pool; int32_t nprocs; const char *machine_file; diff --git a/src/julia.h b/src/julia.h index 0a88633905324..caebdf450ed75 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1684,6 +1684,7 @@ JL_DLLEXPORT jl_sym_t *jl_get_ARCH(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_get_libllvm(void) JL_NOTSAFEPOINT; extern JL_DLLIMPORT int jl_n_threadpools; extern
JL_DLLIMPORT _Atomic(int) jl_n_threads; +extern JL_DLLIMPORT int jl_n_gcthreads; extern JL_DLLIMPORT int *jl_n_threads_per_pool; // environment entries diff --git a/src/julia_internal.h b/src/julia_internal.h index ce5b9409f35be..9ecc97cc64683 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -582,8 +582,8 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT } } -void jl_gc_debug_print_status(void); -JL_DLLEXPORT void jl_gc_debug_critical_error(void); +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT; void jl_print_gc_stats(JL_STREAM *s); void jl_gc_reset_alloc_count(void); uint32_t jl_get_gs_ctr(void); @@ -1173,7 +1173,7 @@ size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_cont #endif JL_DLLEXPORT jl_value_t *jl_get_backtrace(void); void jl_critical_error(int sig, int si_code, bt_context_t *context, jl_task_t *ct); -JL_DLLEXPORT void jl_raise_debugger(void); +JL_DLLEXPORT void jl_raise_debugger(void) JL_NOTSAFEPOINT; int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT; void jl_print_native_codeloc(char *pre_str, uintptr_t ip) JL_NOTSAFEPOINT; diff --git a/src/julia_threads.h b/src/julia_threads.h index 847465b363a2e..a102c6904cd8c 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -4,6 +4,7 @@ #ifndef JL_THREADS_H #define JL_THREADS_H +#include "work-stealing-queue.h" #include "julia_atomics.h" #ifndef _OS_WINDOWS_ #include "pthread.h" @@ -171,16 +172,11 @@ typedef struct { arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; -// Cache of thread local change to global metadata during GC -// This is sync'd after marking. -typedef union _jl_gc_mark_data jl_gc_mark_data_t; - typedef struct { - void **pc; // Current stack address for the pc (up growing) - jl_gc_mark_data_t *data; // Current stack address for the data (up growing) - void **pc_start; // Cached value of `gc_cache->pc_stack` - void **pc_end; // Cached value of `gc_cache->pc_stack_end` -} jl_gc_mark_sp_t; + ws_queue_t chunk_queue; + ws_queue_t ptr_queue; + arraylist_t reclaim_set; +} jl_gc_markqueue_t; typedef struct { // thread local increment of `perm_scanned_bytes` @@ -198,9 +194,6 @@ typedef struct { // this makes sure that a single objects can only appear once in // the lists (the mark bit cannot be flipped to `0` without sweeping) void *big_obj[1024]; - void **pc_stack; - void **pc_stack_end; - jl_gc_mark_data_t *data_stack; } jl_gc_mark_cache_t; struct _jl_bt_element_t; @@ -266,9 +259,9 @@ typedef struct _jl_tls_states_t { #endif jl_thread_t system_id; arraylist_t finalizers; + jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; arraylist_t sweep_objs; - jl_gc_mark_sp_t gc_mark_sp; // Saved exception for previous *external* API call or NULL if cleared. // Access via jl_exception_occurred(). 
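The jl_gc_markqueue_t introduced above pairs a pointer queue with a chunk queue. A plausible drain loop follows (the helper names are assumptions; the declared entry point is gc_mark_loop_serial_ in src/gc.h). Note that popping an empty chunk queue naturally yields GC_empty_chunk, because ws_queue_pop zeroes its destination when the queue is empty and GC_empty_chunk == 0.

    // Sketch: drain pointers first; when the pointer queue runs dry, expand a
    // deferred chunk back into pointers; stop once both queues are empty.
    static void drain_markqueue(jl_ptls_t ptls, jl_gc_markqueue_t *mq)
    {
        while (1) {
            void *obj = gc_ptr_queue_pop(mq);   // assumed helper
            if (obj != NULL) {
                gc_mark_outrefs(ptls, mq, obj); // scan fields, push children
                continue;
            }
            jl_gc_chunk_t c = gc_chunkqueue_pop(mq); // assumed helper
            if (c.cid == GC_empty_chunk)
                break;
            gc_mark_chunk(ptls, mq, &c); // re-expands into the pointer queue
        }
    }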
struct _jl_value_t *previous_exception; diff --git a/src/options.h b/src/options.h index 82b71431ecea0..0e6603c59d682 100644 --- a/src/options.h +++ b/src/options.h @@ -134,6 +134,9 @@ // threadpools specification #define THREADPOOLS_NAME "JULIA_THREADPOOLS" +// GC threads +#define NUM_GC_THREADS_NAME "JULIA_NUM_GC_THREADS" + // affinitization behavior #define MACHINE_EXCLUSIVE_NAME "JULIA_EXCLUSIVE" #define DEFAULT_MACHINE_EXCLUSIVE 0 diff --git a/src/partr.c b/src/partr.c index bbcf5048cace2..793b48481850f 100644 --- a/src/partr.c +++ b/src/partr.c @@ -78,7 +78,7 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_mark_sp_t *sp, jl_value_t *obj) JL_NOTSAFEPOINT; + jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; // parallel task runtime // --- @@ -108,7 +108,37 @@ void jl_init_threadinginfra(void) void JL_NORETURN jl_finish_task(jl_task_t *t); -// thread function: used by all except the main thread +extern uv_mutex_t gc_threads_lock; +extern uv_cond_t gc_threads_cond; +extern _Atomic(int) gc_n_threads_marking; +extern void gc_mark_loop_parallel(jl_ptls_t ptls, int master); + +// gc thread function +void jl_gc_threadfun(void *arg) +{ + jl_threadarg_t *targ = (jl_threadarg_t*)arg; + + // initialize this thread (set tid and create heap) + jl_ptls_t ptls = jl_init_threadtls(targ->tid); + + // wait for all threads + jl_gc_state_set(ptls, JL_GC_STATE_WAITING, 0); + uv_barrier_wait(targ->barrier); + + // free the thread argument here + free(targ); + + while (1) { + uv_mutex_lock(&gc_threads_lock); + while (jl_atomic_load(&gc_n_threads_marking) == 0) { + uv_cond_wait(&gc_threads_cond, &gc_threads_lock); + } + uv_mutex_unlock(&gc_threads_lock); + gc_mark_loop_parallel(ptls, 0); + } +} + +// thread function: used by all mutator threads except the main thread void jl_threadfun(void *arg) { jl_threadarg_t *targ = (jl_threadarg_t*)arg; @@ -419,7 +449,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, uv_mutex_lock(&ptls->sleep_lock); while (may_sleep(ptls)) { uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock); - // TODO: help with gc work here, if applicable } assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping); uv_mutex_unlock(&ptls->sleep_lock); diff --git a/src/support/MurmurHash3.c b/src/support/MurmurHash3.c index 43bd015ddd69f..1f7056d0e7e56 100644 --- a/src/support/MurmurHash3.c +++ b/src/support/MurmurHash3.c @@ -12,8 +12,6 @@ //----------------------------------------------------------------------------- // Platform-specific functions and macros -#define FORCE_INLINE inline __attribute__((always_inline)) - static inline uint32_t rotl32 ( uint32_t x, int8_t r ) { return (x << r) | (x >> (32 - r)); diff --git a/src/support/dtypes.h b/src/support/dtypes.h index d49ae0b22b5f9..891c091413084 100644 --- a/src/support/dtypes.h +++ b/src/support/dtypes.h @@ -117,6 +117,7 @@ typedef intptr_t ssize_t; #define LLT_FREE(x) free(x) #define STATIC_INLINE static inline +#define FORCE_INLINE static inline __attribute__((always_inline)) #if defined(_OS_WINDOWS_) && !defined(_COMPILER_GCC_) # define NOINLINE __declspec(noinline) diff --git a/src/threading.c b/src/threading.c index 8500279220825..7ed8e3b6e7dc9 100644 --- a/src/threading.c +++ b/src/threading.c @@ -562,6 +562,8 @@ static void jl_check_tls(void) JL_DLLEXPORT const int jl_tls_elf_support = 0; #endif +extern int gc_first_tid; + // interface to Julia; sets 
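The sleep side of the GC-thread protocol is jl_gc_threadfun above; the wake side is not shown in this hunk. A sketch of what the marking master presumably does, using only the primitives declared here (gc_threads_lock, gc_threads_cond, gc_n_threads_marking): the counter is made nonzero before broadcasting, so a GC thread that wakes up finds the wait condition false and proceeds to gc_mark_loop_parallel.

    // Sketch of the wake side: publish work, then wake all GC threads.
    static void gc_wake_gc_threads(void)
    {
        uv_mutex_lock(&gc_threads_lock);
        jl_atomic_fetch_add(&gc_n_threads_marking, 1); // count the master in
        uv_cond_broadcast(&gc_threads_cond);
        uv_mutex_unlock(&gc_threads_lock);
    }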
up to make the runtime thread-safe void jl_init_threading(void) { @@ -614,13 +616,32 @@ void jl_init_threading(void) } } - jl_all_tls_states_size = nthreads + nthreadsi; + int16_t ngcthreads = jl_options.ngcthreads - 1; + if (ngcthreads == -1 && + (cp = getenv(NUM_GC_THREADS_NAME))) { // ENV[NUM_GC_THREADS_NAME] specified + + ngcthreads = (uint64_t)strtol(cp, NULL, 10) - 1; + } + if (ngcthreads == -1) { + // if `--gcthreads` was not specified, set the number of GC threads + // to half of the number of compute threads + if (nthreads <= 1) { + ngcthreads = 0; + } + else { + ngcthreads = (nthreads / 2) - 1; + } + } + + jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreadsi; jl_n_threads_per_pool[1] = nthreads; jl_atomic_store_release(&jl_all_tls_states, (jl_ptls_t*)calloc(jl_all_tls_states_size, sizeof(jl_ptls_t))); jl_atomic_store_release(&jl_n_threads, jl_all_tls_states_size); + jl_n_gcthreads = ngcthreads; + gc_first_tid = nthreads; } static uv_barrier_t thread_init_done; @@ -628,6 +649,7 @@ void jl_start_threads(void) { int nthreads = jl_atomic_load_relaxed(&jl_n_threads); + int ngcthreads = jl_n_gcthreads; int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; @@ -660,15 +682,23 @@ void jl_start_threads(void) // create threads uv_barrier_init(&thread_init_done, nthreads); + // GC/System threads need to be after the worker threads. + int nworker_threads = nthreads - ngcthreads; + for (i = 1; i < nthreads; ++i) { jl_threadarg_t *t = (jl_threadarg_t *)malloc_s(sizeof(jl_threadarg_t)); // ownership will be passed to the thread t->tid = i; t->barrier = &thread_init_done; - uv_thread_create(&uvtid, jl_threadfun, t); - if (exclusive) { - mask[i] = 1; - uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize); - mask[i] = 0; + if (i < nworker_threads) { + uv_thread_create(&uvtid, jl_threadfun, t); + if (exclusive) { + mask[i] = 1; + uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize); + mask[i] = 0; + } + } + else { + uv_thread_create(&uvtid, jl_gc_threadfun, t); } uv_thread_detach(&uvtid); } diff --git a/src/threading.h b/src/threading.h index 9fd63f0fd188d..1cf2d4a9d3711 100644 --- a/src/threading.h +++ b/src/threading.h @@ -25,6 +25,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid); // provided by a threading infrastructure void jl_init_threadinginfra(void); +void jl_gc_threadfun(void *arg); void jl_threadfun(void *arg); #ifdef __cplusplus diff --git a/src/work-stealing-queue.h b/src/work-stealing-queue.h new file mode 100644 index 0000000000000..38429e02886e9 --- /dev/null +++ b/src/work-stealing-queue.h @@ -0,0 +1,102 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#ifndef WORK_STEALING_QUEUE_H +#define WORK_STEALING_QUEUE_H + +#include "julia_atomics.h" +#include "assert.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// ======= +// Chase and Lev's work-stealing queue, optimized for +// weak memory models by Le et al. +// +// * Chase D., Lev Y. Dynamic Circular Work-Stealing queue +// * Le N. M. et al.
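Worked example for the bookkeeping above: `julia -t 8 --gcthreads=4` (no interactive pool) gives jl_options.ngcthreads = 4, hence an internal ngcthreads of 3, since the thread driving the collection presumably also participates in marking. Then jl_all_tls_states_size = 8 + 0 + 3 = 11 and gc_first_tid = 8; jl_start_threads spawns tids 1..7 running jl_threadfun and tids 8..10 running jl_gc_threadfun, and Threads.ngcthreads() reports jl_n_gcthreads + 1 == 4.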
Correct and Efficient Work-Stealing for +// Weak Memory Models +// ======= + +typedef struct { + char *buffer; + int32_t capacity; + int32_t mask; +} ws_array_t; + +static inline ws_array_t *create_ws_array(size_t capacity, int32_t eltsz) JL_NOTSAFEPOINT +{ + ws_array_t *a = (ws_array_t *)malloc_s(sizeof(ws_array_t)); + a->buffer = (char *)malloc_s(capacity * eltsz); + a->capacity = capacity; + a->mask = capacity - 1; + return a; +} + +typedef struct { + _Atomic(int64_t) top; + _Atomic(int64_t) bottom; + _Atomic(ws_array_t *) array; +} ws_queue_t; + +static inline ws_array_t *ws_queue_push(ws_queue_t *q, void *elt, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t b = jl_atomic_load_relaxed(&q->bottom); + int64_t t = jl_atomic_load_acquire(&q->top); + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + ws_array_t *old_ary = NULL; + if (__unlikely(b - t > ary->capacity - 1)) { + ws_array_t *new_ary = create_ws_array(2 * ary->capacity, eltsz); + for (int i = 0; i < ary->capacity; i++) { + memcpy(new_ary->buffer + ((t + i) & new_ary->mask) * eltsz, ary->buffer + ((t + i) & ary->mask) * eltsz, eltsz); + } + jl_atomic_store_release(&q->array, new_ary); + old_ary = ary; + ary = new_ary; + } + memcpy(ary->buffer + (b & ary->mask) * eltsz, elt, eltsz); + jl_fence_release(); + jl_atomic_store_relaxed(&q->bottom, b + 1); + return old_ary; +} + +static inline void ws_queue_pop(ws_queue_t *q, void *dest, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t b = jl_atomic_load_relaxed(&q->bottom) - 1; + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + jl_atomic_store_relaxed(&q->bottom, b); + jl_fence(); + int64_t t = jl_atomic_load_relaxed(&q->top); + if (__likely(t <= b)) { + memcpy(dest, ary->buffer + (b & ary->mask) * eltsz, eltsz); + if (t == b) { + if (!jl_atomic_cmpswap(&q->top, &t, t + 1)) + memset(dest, 0, eltsz); + jl_atomic_store_relaxed(&q->bottom, b + 1); + } + } + else { + memset(dest, 0, eltsz); + jl_atomic_store_relaxed(&q->bottom, b + 1); + } +} + +static inline void ws_queue_steal_from(ws_queue_t *q, void *dest, int32_t eltsz) JL_NOTSAFEPOINT +{ + int64_t t = jl_atomic_load_acquire(&q->top); + jl_fence(); + int64_t b = jl_atomic_load_acquire(&q->bottom); + if (t < b) { + ws_array_t *ary = jl_atomic_load_relaxed(&q->array); + memcpy(dest, ary->buffer + (t & ary->mask) * eltsz, eltsz); + if (!jl_atomic_cmpswap(&q->top, &t, t + 1)) + memset(dest, 0, eltsz); + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/stdlib/Distributed/src/cluster.jl b/stdlib/Distributed/src/cluster.jl index 554a3d9185080..d8039ba160d66 100644 --- a/stdlib/Distributed/src/cluster.jl +++ b/stdlib/Distributed/src/cluster.jl @@ -1345,9 +1345,10 @@ function process_opts(opts) end # Propagate --threads to workers - threads = get_threads_spec(opts) + threads = opts.nthreads > 0 ? `--threads=$(opts.nthreads)` : `` + gcthreads = opts.ngcthreads > 0 ? 
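A single-threaded usage sketch for the queue above (illustrative: the GC stores jl_value_t * and jl_gc_chunk_t elements, not ints). Capacities must be powers of two for the index mask to work. Only the owning thread may push/pop at the bottom; any thread may steal from the top. A resize makes ws_queue_push hand back the old array, which must not be freed while thieves may still be reading it; that is what the reclaim_set in jl_gc_markqueue_t is for, drained by gc_mark_clean_reclaim_sets() after the mark barrier.

    #include "work-stealing-queue.h"

    typedef struct { int v; } item_t;

    static void ws_queue_example(void)
    {
        ws_queue_t q = {0};
        jl_atomic_store_relaxed(&q.array, create_ws_array(8, sizeof(item_t)));

        item_t in = {42}, out = {0};
        ws_array_t *old = ws_queue_push(&q, &in, sizeof(item_t));
        // `old` is non-NULL only if the push grew the buffer; defer freeing it
        // (the GC parks it in the reclaim set until marking has finished).
        (void)old;

        ws_queue_pop(&q, &out, sizeof(item_t));        // out.v == 42
        ws_queue_steal_from(&q, &out, sizeof(item_t)); // no-op: queue is empty now
    }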
`--gcthreads=$(opts.ngcthreads)` : `` - exeflags = `$threads` + exeflags = `$threads $gcthreads` # add processors if opts.nprocs > 0 diff --git a/test/choosetests.jl b/test/choosetests.jl index 334ef051a0fe6..7d426221180f5 100644 --- a/test/choosetests.jl +++ b/test/choosetests.jl @@ -21,7 +21,7 @@ const TESTNAMES = [ "combinatorics", "sysinfo", "env", "rounding", "ranges", "mod2pi", "euler", "show", "client", "errorshow", "sets", "goto", "llvmcall", "llvmcall2", "ryu", - "some", "meta", "stacktraces", "docs", + "some", "meta", "stacktraces", "docs", "gc", "misc", "threads", "stress", "binaryplatforms", "atexit", "enums", "cmdlineargs", "int", "interpreter", "checked", "bitset", "floatfuncs", "precompile", diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 7f86534f923b3..f73a7854fd2f1 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -269,6 +269,24 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no` @test p.exitcode == 1 && p.termsignal == 0 end + # --gcthreads + code = "print(Threads.ngcthreads())" + cpu_threads = ccall(:jl_effective_threads, Int32, ()) + @test (cpu_threads == 1 ? "1" : string(div(cpu_threads, 2))) == + read(`$exename --threads auto -e $code`, String) == + read(`$exename --threads=auto -e $code`, String) == + read(`$exename -tauto -e $code`, String) == + read(`$exename -t auto -e $code`, String) + for nt in (nothing, "1") + withenv("JULIA_NUM_GC_THREADS" => nt) do + @test read(`$exename --gcthreads=2 -e $code`, String) == "2" + end + end + + withenv("JULIA_NUM_GC_THREADS" => 2) do + @test read(`$exename -e $code`, String) == "2" + end + # --machine-file # this does not check that machine file works, # only that the filename gets correctly passed to the option struct diff --git a/test/gc.jl b/test/gc.jl new file mode 100644 index 0000000000000..9cc9d753dfc09 --- /dev/null +++ b/test/gc.jl @@ -0,0 +1,18 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Test + +function run_gctest(file) + let cmd = `$(Base.julia_cmd()) --depwarn=error --rr-detach --startup-file=no $file` + for test_nthreads in (1, 2, 4) + new_env = copy(ENV) + new_env["JULIA_NUM_THREADS"] = string(test_nthreads) + new_env["JULIA_NUM_GC_THREADS"] = string(test_nthreads) + @time run(pipeline(setenv(cmd, new_env), stdout = stdout, stderr = stderr)) + end + end +end + +@time run_gctest("gc/binarytree.jl") +@time run_gctest("gc/linkedlist.jl") +@time run_gctest("gc/objarray.jl") diff --git a/test/gc/binarytree.jl b/test/gc/binarytree.jl new file mode 100644 index 0000000000000..3089e2d2ce869 --- /dev/null +++ b/test/gc/binarytree.jl @@ -0,0 +1,53 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +module BinaryTreeMutable + +# Adopted from +# https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/binarytrees.html#binarytrees + +using Base.Threads +using Printf + +mutable struct Node + l::Union{Nothing, Node} + r::Union{Nothing, Node} +end + +function make(n::Int) + return n === 0 ? Node(nothing, nothing) : Node(make(n-1), make(n-1)) +end + +function check(node::Node) + return 1 + (node.l === nothing ? 
0 : check(node.l) + check(node.r)) +end + +function binary_trees(io, n::Int) + @printf io "stretch tree of depth %jd\t check: %jd\n" n+1 check(make(n+1)) + + long_tree = make(n) + minDepth = 4 + resultSize = div((n - minDepth), 2) + 1 + results = Vector{String}(undef, resultSize) + Threads.@threads for depth in minDepth:2:n + c = 0 + niter = 1 << (n - depth + minDepth) + for _ in 1:niter + c += check(make(depth)) + end + index = div((depth - minDepth),2) + 1 + results[index] = @sprintf "%jd\t trees of depth %jd\t check: %jd\n" niter depth c + end + + for i in results + write(io, i) + end + + @printf io "long lived tree of depth %jd\t check: %jd\n" n check(long_tree) +end + +end #module + +using .BinaryTreeMutable + +BinaryTreeMutable.binary_trees(devnull, 20) +GC.gc() diff --git a/test/gc/linkedlist.jl b/test/gc/linkedlist.jl new file mode 100644 index 0000000000000..c447a9680326d --- /dev/null +++ b/test/gc/linkedlist.jl @@ -0,0 +1,21 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +mutable struct ListNode + key::Int64 + next::ListNode + ListNode() = new() + ListNode(x)= new(x) + ListNode(x,y) = new(x,y); +end + +function list(n=128) + start::ListNode = ListNode(1) + current::ListNode = start + for i = 2:(n*1024^2) + current = ListNode(i,current) + end + return current.key +end + +_ = list() +GC.gc() diff --git a/test/gc/objarray.jl b/test/gc/objarray.jl new file mode 100644 index 0000000000000..4b4cb67c42eac --- /dev/null +++ b/test/gc/objarray.jl @@ -0,0 +1,35 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Random: seed! +seed!(1) + +abstract type Cell end + +struct CellA<:Cell + a::Ref{Int} +end + +struct CellB<:Cell + b::String +end + +function fillcells!(mc::Array{Cell}) + for ind in eachindex(mc) + mc[ind] = ifelse(rand() > 0.5, CellA(ind), CellB(string(ind))) + end + return mc +end + +function work(size) + mcells = Array{Cell}(undef, size, size) + fillcells!(mcells) +end + +function run(maxsize) + Threads.@threads for i in 1:maxsize + work(i*500) + end +end + +run(4) +GC.gc()