From 1e27aaec7ff05791b97fa032348ccdc7900c69ef Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi
Date: Tue, 10 Sep 2024 11:19:31 -0300
Subject: [PATCH] Move assertion to correct place.

---
 src/gc-stacks.c | 51 ++++++++++++++++++++++++++++++++-----------------
 src/gc-tls.h    |  1 +
 src/gc.c        | 44 +++++++++++++++++++++++++++++++++++++++++-
 src/gc.h        |  4 +++-
 src/partr.c     | 12 +++++++++++-
 5 files changed, 92 insertions(+), 20 deletions(-)

diff --git a/src/gc-stacks.c b/src/gc-stacks.c
index 1c1e2869607620..78179c5d949438 100644
--- a/src/gc-stacks.c
+++ b/src/gc-stacks.c
@@ -190,7 +190,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO
     return stk;
 }
 
-void sweep_stack_pools(void)
+void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT
 {
     // Stack sweeping algorithm:
     //    // deallocate stacks if we have too many sitting around unused
@@ -203,27 +203,43 @@ void sweep_stack_pools(void)
     //        bufsz = t->bufsz
     //        if (stkbuf)
     //            push(free_stacks[sz], stkbuf)
-    assert(gc_n_threads);
-    for (int i = 0; i < gc_n_threads; i++) {
+    jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
+    while (1) {
+        int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1);
+        if (i < 0)
+            break;
         jl_ptls_t ptls2 = gc_all_tls_states[i];
+        if (ptls2 == NULL)
+            continue;
 
         // free half of stacks that remain unused since last sweep
-        for (int p = 0; p < JL_N_STACK_POOLS; p++) {
-            small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p];
-            size_t n_to_free;
-            if (al->len > MIN_STACK_MAPPINGS_PER_POOL) {
-                n_to_free = al->len / 2;
-                if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL))
-                    n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL;
-            }
-            else {
-                n_to_free = 0;
-            }
-            for (int n = 0; n < n_to_free; n++) {
-                void *stk = small_arraylist_pop(al);
-                free_stack(stk, pool_sizes[p]);
+        if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) {
+            for (int p = 0; p < JL_N_STACK_POOLS; p++) {
+                small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p];
+                size_t n_to_free;
+                if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
+                    n_to_free = al->len; // not alive yet or dead, so it does not need these anymore
+                }
+                else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) {
+                    n_to_free = al->len / 2;
+                    if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL))
+                        n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL;
+                }
+                else {
+                    n_to_free = 0;
+                }
+                for (int n = 0; n < n_to_free; n++) {
+                    void *stk = small_arraylist_pop(al);
+                    free_stack(stk, pool_sizes[p]);
+                }
+                if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
+                    small_arraylist_free(al);
+                }
             }
         }
+        if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
+            small_arraylist_free(ptls2->gc_tls.heap.free_stacks);
+        }
 
         small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks;
         size_t n = 0;
@@ -264,6 +280,7 @@
         }
         live_tasks->len -= ndel;
     }
+    jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
 }
 
 JL_DLLEXPORT jl_array_t *jl_live_tasks(void)
diff --git a/src/gc-tls.h b/src/gc-tls.h
index a9f711198e9142..6c9b814ca5661f 100644
--- a/src/gc-tls.h
+++ b/src/gc-tls.h
@@ -82,6 +82,7 @@ typedef struct {
     jl_gc_markqueue_t mark_queue;
     jl_gc_mark_cache_t gc_cache;
     _Atomic(size_t) gc_sweeps_requested;
+    _Atomic(uint8_t) gc_stack_sweep_requested;
    arraylist_t sweep_objs;
 } jl_gc_tls_states_t;
 
diff --git a/src/gc.c b/src/gc.c
index dad57687325450..9b174df9e15b68 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -26,6 +26,10 @@ _Atomic(int) gc_n_threads_sweeping;
 _Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
 // `tid` of mutator thread that triggered GC
 _Atomic(int) gc_master_tid;
+// counter for sharing work when sweeping stacks
+_Atomic(int) gc_ptls_sweep_idx;
+// counter for round robin of giving back stack pages to the OS
+_Atomic(int) gc_stack_free_idx;
 // `tid` of first GC thread
 int gc_first_tid;
 // Mutex/cond used to synchronize wakeup of GC threads on parallel marking
@@ -1525,6 +1529,44 @@ static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
     gc_num.total_sweep_free_mallocd_memory_time += t_free_mallocd_memory_end - t_free_mallocd_memory_start;
 }
 
+// wake up all threads to sweep the stacks
+void gc_sweep_wake_all_stacks(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    uv_mutex_lock(&gc_threads_lock);
+    int first = gc_first_parallel_collector_thread_id();
+    int last = gc_last_parallel_collector_thread_id();
+    for (int i = first; i <= last; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        gc_check_ptls_of_parallel_collector_thread(ptls2);
+        jl_atomic_fetch_add(&ptls2->gc_tls.gc_stack_sweep_requested, 1);
+    }
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+    return;
+}
+
+void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT
+{
+    while ((jl_atomic_load_acquire(&gc_ptls_sweep_idx)>= 0 ) || jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) {
+        jl_cpu_pause();
+    }
+}
+
+void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    // initialize ptls index for parallel sweeping of stack pools
+    assert(gc_n_threads);
+    int stack_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx);
+    if (stack_free_idx + 1 == gc_n_threads)
+        jl_atomic_store_relaxed(&gc_stack_free_idx, 0);
+    else
+        jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1);
+    jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // idx == gc_n_threads = release stacks to the OS so it's serial
+    gc_sweep_wake_all_stacks(ptls);
+    sweep_stack_pool_loop();
+    gc_sweep_wait_for_all_stacks();
+}
+
 static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT
 {
     assert(pg->fl_begin_offset != UINT16_MAX);
@@ -3604,7 +3646,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
 #endif
     current_sweep_full = sweep_full;
     sweep_weak_refs();
-    sweep_stack_pools();
+    sweep_stack_pools(ptls);
     gc_sweep_foreign_objs();
     gc_sweep_other(ptls, sweep_full);
     gc_scrub();
diff --git a/src/gc.h b/src/gc.h
index b06deec9d72389..2d18f91cf36a2e 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -565,6 +565,8 @@ extern uv_cond_t gc_threads_cond;
 extern uv_sem_t gc_sweep_assists_needed;
 extern _Atomic(int) gc_n_threads_marking;
 extern _Atomic(int) gc_n_threads_sweeping;
+extern _Atomic(int) gc_ptls_sweep_idx;
+extern _Atomic(int) gc_stack_free_idx;
 extern uv_barrier_t thread_init_done;
 void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
 void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t *fl_parent, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT;
@@ -574,7 +576,7 @@ void gc_mark_loop_serial(jl_ptls_t ptls);
 void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
 void gc_sweep_pool_parallel(jl_ptls_t ptls);
 void gc_free_pages(void);
-void sweep_stack_pools(void);
+void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT;
 void jl_gc_debug_init(void);
 
 // GC pages
diff --git a/src/partr.c b/src/partr.c
index 33631dc83c05a8..937e7a3f3a9052 100644
--- a/src/partr.c
+++ b/src/partr.c
@@ -118,6 +118,11 @@ static inline int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT
     return (jl_atomic_load(&ptls->gc_tls.gc_sweeps_requested) > 0);
 }
 
+static inline int may_sweep_stack(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    return (jl_atomic_load(&ptls->gc_tls.gc_stack_sweep_requested) > 0);
+}
+
 // parallel gc thread function
 void jl_parallel_gc_threadfun(void *arg)
 {
@@ -139,12 +144,17 @@ void jl_parallel_gc_threadfun(void *arg)
 
     while (1) {
         uv_mutex_lock(&gc_threads_lock);
-        while (!may_mark() && !may_sweep(ptls)) {
+        while (!may_mark() && !may_sweep(ptls) && !may_sweep_stack(ptls)) {
             uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
         }
         uv_mutex_unlock(&gc_threads_lock);
         assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
         gc_mark_loop_parallel(ptls, 0);
+        if (may_sweep_stack(ptls)) {
+            assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
+            sweep_stack_pool_loop();
+            jl_atomic_fetch_add(&ptls->gc_tls.gc_stack_sweep_requested, -1);
+        }
         if (may_sweep(ptls)) {
             assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
             gc_sweep_pool_parallel(ptls);
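What the new coordination amounts to: sweep_stack_pools(ptls) publishes the highest ptls index with a release store to gc_ptls_sweep_idx, wakes the parallel GC threads, joins in by calling sweep_stack_pool_loop() itself, and then spins in gc_sweep_wait_for_all_stacks() until the index has gone negative and gc_n_threads_sweeping is back to zero. Each sweeper claims one thread's stacks at a time by atomically decrementing the shared index. The following standalone program shows the same claim-and-drain pattern with C11 atomics and pthreads; it is an illustrative sketch only, not part of the patch, and every name in it (work_idx, n_sweeping, N_SLOTS, N_THREADS, process_slot, sweep_loop) is invented for the example.

/* Sketch only: invented names, see the note above. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define N_SLOTS 8     /* stands in for gc_n_threads */
#define N_THREADS 4   /* stands in for the parallel GC threads */

static _Atomic int work_idx;    /* counts down, like gc_ptls_sweep_idx */
static _Atomic int n_sweeping;  /* active sweepers, like gc_n_threads_sweeping */
static int results[N_SLOTS];

static void process_slot(int i)
{
    results[i] = i * i;  /* stand-in for sweeping one thread's stack pool */
}

/* Every participant runs this: claim slots by decrementing the shared index. */
static void *sweep_loop(void *arg)
{
    (void)arg;
    atomic_fetch_add(&n_sweeping, 1);
    while (1) {
        int i = atomic_fetch_add_explicit(&work_idx, -1, memory_order_relaxed);
        if (i < 0)
            break;
        process_slot(i);
    }
    atomic_fetch_add(&n_sweeping, -1);
    return NULL;
}

int main(void)
{
    /* Coordinator: publish the highest index, wake helpers, join in, wait. */
    atomic_store_explicit(&work_idx, N_SLOTS - 1, memory_order_release);

    pthread_t helpers[N_THREADS];
    for (int t = 0; t < N_THREADS; t++)
        pthread_create(&helpers[t], NULL, sweep_loop, NULL);

    sweep_loop(NULL);  /* the coordinator sweeps too, like sweep_stack_pools() */

    /* Equivalent of gc_sweep_wait_for_all_stacks(): no unclaimed slots left
       and no sweeper still inside its loop. */
    while (atomic_load_explicit(&work_idx, memory_order_acquire) >= 0 ||
           atomic_load_explicit(&n_sweeping, memory_order_acquire) != 0) {
        /* spin; the real code calls jl_cpu_pause() here */
    }

    for (int t = 0; t < N_THREADS; t++)
        pthread_join(helpers[t], NULL);

    for (int i = 0; i < N_SLOTS; i++)
        printf("slot %d -> %d\n", i, results[i]);
    return 0;
}

Decrementing a single shared counter keeps the work queue implicit: nothing is allocated or locked, and a helper that wakes up late simply sees a negative index and falls straight through the loop.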
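The hand-off between the collector and the parallel GC threads reuses the existing gc_threads_lock/gc_threads_cond pair: gc_sweep_wake_all_stacks() bumps each helper's gc_stack_sweep_requested counter under the lock and broadcasts, and the helper loop in partr.c now also wakes when may_sweep_stack(ptls) is true, runs sweep_stack_pool_loop(), and acknowledges by decrementing the counter. Below is a minimal single-worker sketch of that request-counter-plus-condition-variable handshake; it is not part of the patch, and the names (request, shutdown_flag, worker) are invented for the example.

/* Sketch only: invented names, see the note above. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static _Atomic int request;        /* like gc_tls.gc_stack_sweep_requested */
static _Atomic int shutdown_flag;  /* only so the example can terminate */

static void *worker(void *arg)
{
    (void)arg;
    while (1) {
        pthread_mutex_lock(&lock);
        /* sleep until there is a request, cf. may_sweep_stack(ptls) */
        while (atomic_load(&request) == 0 && !atomic_load(&shutdown_flag))
            pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        if (atomic_load(&shutdown_flag))
            break;
        printf("sweeping stacks\n");     /* stand-in for sweep_stack_pool_loop() */
        atomic_fetch_add(&request, -1);  /* acknowledge the request */
    }
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, worker, NULL);

    /* Coordinator side, mirroring gc_sweep_wake_all_stacks(): bump the
       request counter under the lock, then broadcast. */
    pthread_mutex_lock(&lock);
    atomic_fetch_add(&request, 1);
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);

    while (atomic_load(&request) != 0) {
        /* crude stand-in for the coordinator's wait step */
    }

    pthread_mutex_lock(&lock);
    atomic_store(&shutdown_flag, 1);
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);
    pthread_join(t, NULL);
    return 0;
}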
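Inside sweep_stack_pool_loop(), only the ptls whose index matches gc_stack_free_idx trims its cache of free stacks on a given collection (the index advances round-robin in sweep_stack_pools), and the amount trimmed follows the pre-existing heuristic: free everything when the owning thread has no current task, otherwise free half of the cached stacks while keeping at least MIN_STACK_MAPPINGS_PER_POOL of them. A restatement of just that computation, with invented names (stacks_to_free, MIN_KEEP, thread_is_dead) and an assumed floor of 5:

/* Sketch only: invented names and an assumed constant, see the note above. */
#include <stddef.h>

#define MIN_KEEP 5  /* assumed value of MIN_STACK_MAPPINGS_PER_POOL */

/* How many cached stacks one pool gives back on a sweep. */
static size_t stacks_to_free(size_t pool_len, int thread_is_dead)
{
    if (thread_is_dead)
        return pool_len;        /* no current task: the cache is not needed at all */
    if (pool_len <= MIN_KEEP)
        return 0;               /* keep a small warm cache */
    size_t half = pool_len / 2; /* otherwise release half per sweep ... */
    size_t excess = pool_len - MIN_KEEP;
    return half < excess ? half : excess;  /* ... but never drop below the floor */
}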