From 334e4d9f5d5bb1338279bc03b83bafd842736093 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Fri, 28 Jun 2024 12:46:50 -0300 Subject: [PATCH] simplify handling of buffered pages (#54961) Simplifies handling of buffered pages by keeping them in a single place (`global_page_pool_lazily_freed`) instead of making them thread local. Performance has been assessed on the serial & multithreaded GCBenchmarks and it has shown to be performance neutral. --- src/gc.c | 85 +++++++++++++++++---------------------------- src/gc.h | 2 +- src/julia_threads.h | 1 - 3 files changed, 32 insertions(+), 56 deletions(-) diff --git a/src/gc.c b/src/gc.c index 17f34a34716d5..c6f935c057112 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1293,13 +1293,7 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT // Do not pass in `ptls` as argument. This slows down the fast path // in pool_alloc significantly jl_ptls_t ptls = jl_current_task->ptls; - jl_gc_pagemeta_t *pg = pop_lf_back(&ptls->page_metadata_buffered); - if (pg != NULL) { - gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED); - } - else { - pg = jl_gc_alloc_page(); - } + jl_gc_pagemeta_t *pg = jl_gc_alloc_page(); pg->osize = p->osize; pg->thread_n = ptls->tid; set_page_metadata(pg); @@ -1430,12 +1424,9 @@ STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT } } -int64_t buffered_pages = 0; - // Walks over a page, reconstruting the free lists if the page contains at least one live object. If not, // queues up the page for later decommit (i.e. through `madvise` on Unix). -static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *buffered, - jl_gc_pagemeta_t *pg, int osize) JL_NOTSAFEPOINT +static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pagemeta_t *pg, int osize) JL_NOTSAFEPOINT { char *data = pg->data; jl_taggedvalue_t *v0 = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); @@ -1451,22 +1442,10 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ gc_page_serializer_init(s, pg); int re_use_page = 1; - int keep_as_local_buffer = 0; int freedall = 1; int pg_skpd = 1; if (!pg->has_marked) { re_use_page = 0; - #ifdef _P64 // TODO: re-enable on `_P32`? - // lazy version: (empty) if the whole page was already unused, free it (return it to the pool) - // eager version: (freedall) free page as soon as possible - // the eager one uses less memory. - // FIXME - need to do accounting on a per-thread basis - // on quick sweeps, keep a few pages empty but allocated for performance - if (!current_sweep_full && buffered_pages <= default_collect_interval / GC_PAGE_SZ) { - buffered_pages++; - keep_as_local_buffer = 1; - } - #endif nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / osize; gc_page_profile_write_empty_page(s, page_profile_enabled); goto done; @@ -1552,14 +1531,9 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ push_lf_back(allocd, pg); } else { - gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ); - if (keep_as_local_buffer) { - push_lf_back(buffered, pg); - } - else { - push_lf_back(&global_page_pool_lazily_freed, pg); - } + gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED); + push_lf_back(&global_page_pool_lazily_freed, pg); } gc_page_profile_write_to_file(s); gc_update_page_fragmentation_data(pg); @@ -1570,15 +1544,14 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ } // the actual sweeping over all allocated pages in a memory pool -STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *lazily_freed, - jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT +STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_page_stack_t *allocd, jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { int p_n = pg->pool_n; int t_n = pg->thread_n; jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; - gc_sweep_page(s, p, allocd, lazily_freed, pg, osize); + gc_sweep_page(s, p, allocd, pg, osize); } // sweep over all memory that is being used and not in a pool @@ -1647,7 +1620,7 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc push_lf_back_nosync(&tmp, pg); } else { - gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg); + gc_sweep_pool_page(&serializer, dest, pg); } if (n_pages_to_scan >= n_pages_worth_parallel_sweep) { break; @@ -1728,7 +1701,7 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls) if (pg == NULL) { continue; } - gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg); + gc_sweep_pool_page(&serializer, dest, pg); found_pg = 1; } if (!found_pg) { @@ -1760,26 +1733,38 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls) // free all pages (i.e. through `madvise` on Linux) that were lazily freed void gc_free_pages(void) { + size_t n_pages_seen = 0; + jl_gc_page_stack_t tmp; + memset(&tmp, 0, sizeof(tmp)); while (1) { jl_gc_pagemeta_t *pg = pop_lf_back(&global_page_pool_lazily_freed); if (pg == NULL) { break; } + n_pages_seen++; + // keep the last few pages around for a while + if (n_pages_seen * GC_PAGE_SZ <= default_collect_interval) { + push_lf_back(&tmp, pg); + continue; + } jl_gc_free_page(pg); push_lf_back(&global_page_pool_freed, pg); } -} - -void gc_move_to_global_page_pool(jl_gc_page_stack_t *pgstack) -{ - while (1) { - jl_gc_pagemeta_t *pg = pop_lf_back(pgstack); - if (pg == NULL) { - break; + // If concurrent page sweeping is disabled, then `gc_free_pages` will be called in the stop-the-world + // phase. We can guarantee, therefore, that there won't be any concurrent modifications to + // `global_page_pool_lazily_freed`, so it's safe to assign `tmp` back to `global_page_pool_lazily_freed`. + // Otherwise, we need to use the thread-safe push_lf_back/pop_lf_back functions. + if (jl_n_sweepthreads == 0) { + global_page_pool_lazily_freed = tmp; + } + else { + while (1) { + jl_gc_pagemeta_t *pg = pop_lf_back(&tmp); + if (pg == NULL) { + break; + } + push_lf_back(&global_page_pool_lazily_freed, pg); } - jl_gc_free_page(pg); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ); - push_lf_back(&global_page_pool_freed, pg); } } @@ -1787,7 +1772,6 @@ void gc_move_to_global_page_pool(jl_gc_page_stack_t *pgstack) static void gc_sweep_pool(void) { gc_time_pool_start(); - buffered_pages = 0; // For the benefit of the analyzer, which doesn't know that gc_n_threads // doesn't change over the course of this function @@ -1828,12 +1812,6 @@ static void gc_sweep_pool(void) pg->has_young = 1; } } - jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_buffered.bottom); - while (pg != NULL) { - jl_gc_pagemeta_t *pg2 = pg->next; - buffered_pages++; - pg = pg2; - } } // the actual sweeping @@ -3817,7 +3795,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) arraylist_free(&ptls2->finalizers); if (ptls2->sweep_objs.len == 0) arraylist_free(&ptls2->sweep_objs); - gc_move_to_global_page_pool(&ptls2->page_metadata_buffered); } } diff --git a/src/gc.h b/src/gc.h index 26f53d2faa471..7d78545966ecc 100644 --- a/src/gc.h +++ b/src/gc.h @@ -153,7 +153,7 @@ typedef struct _mallocarray_t { // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list - // or in one of the `jl_gc_global_page_pool_t` + // or in one of the `jl_gc_page_stack_t` struct _jl_gc_pagemeta_t *next; // index of pool that owns this page uint8_t pool_n; diff --git a/src/julia_threads.h b/src/julia_threads.h index 9d5160ab2ff64..0064493f1d75c 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -257,7 +257,6 @@ typedef struct _jl_tls_states_t { _Atomic(int16_t) suspend_count; arraylist_t finalizers; jl_gc_page_stack_t page_metadata_allocd; - jl_gc_page_stack_t page_metadata_buffered; jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; arraylist_t sweep_objs;