From 282207b08411bb040746251e25e60ff7e591f422 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Mon, 8 Jan 2024 11:29:29 -0300 Subject: [PATCH 1/7] reset mark queue indices at the end of GC (#52780) Should allow us to access fewer pages on the circular buffers owned by these work-stealing queues. --- src/gc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/gc.c b/src/gc.c index 2835f847b8540..26ac86935b18e 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3166,6 +3166,17 @@ void gc_mark_clean_reclaim_sets(void) free(a); } } + // Reset queue indices + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) { + continue; + } + jl_atomic_store_relaxed(&ptls2->mark_queue.ptr_queue.bottom, 0); + jl_atomic_store_relaxed(&ptls2->mark_queue.ptr_queue.top, 0); + jl_atomic_store_relaxed(&ptls2->mark_queue.chunk_queue.bottom, 0); + jl_atomic_store_relaxed(&ptls2->mark_queue.chunk_queue.top, 0); + } } static void gc_premark(jl_ptls_t ptls2) From 1fc101db4b7894f0b04917780821749a4d9756c7 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Fri, 8 Mar 2024 18:44:48 -0300 Subject: [PATCH 2/7] optimize remset marking (#52476) Tag the lowest bit of a pointer to indicate it's in the remset and enqueue objects in the remset for later processing when GC threads have woken up, instead of sequentially marking them all at once. In principle, this should allow for more parallelism in the mark phase, though I didn't benchmark it yet. 
--- src/gc.c | 21 ++++++++++----------- src/gc.h | 2 ++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/gc.c b/src/gc.c index 26ac86935b18e..7df52da0fbcef 100644 --- a/src/gc.c +++ b/src/gc.c @@ -2618,13 +2618,12 @@ JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, gc_mark_objarray(ptls, parent, objs, objs + nobjs, 1, nptr); } -// Enqueue and mark all outgoing references from `new_obj` which have not been marked -// yet. `meta_updated` is mostly used to make sure we don't update metadata twice for -// objects which have been enqueued into the `remset` -FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_new_obj, - int meta_updated) +// Enqueue and mark all outgoing references from `new_obj` which have not been marked yet. +// `_new_obj` has its lowest bit tagged if it's in the remset (in which case we shouldn't update page metadata) +FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_new_obj) { - jl_value_t *new_obj = (jl_value_t *)_new_obj; + int meta_updated = (uintptr_t)_new_obj & GC_REMSET_PTR_TAG; + jl_value_t *new_obj = (jl_value_t *)((uintptr_t)_new_obj & ~(uintptr_t)GC_REMSET_PTR_TAG); mark_obj: { #ifdef JL_DEBUG_BUILD if (new_obj == gc_findval) @@ -2935,7 +2934,7 @@ void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq) if (__unlikely(new_obj == NULL)) { return; } - gc_mark_outrefs(ptls, mq, new_obj, 0); + gc_mark_outrefs(ptls, mq, new_obj); } } @@ -2984,7 +2983,7 @@ void gc_mark_and_steal(jl_ptls_t ptls) goto steal; } mark : { - gc_mark_outrefs(ptls, mq, new_obj, 0); + gc_mark_outrefs(ptls, mq, new_obj); goto pop; } // Note that for the stealing heuristics, we try to @@ -3245,9 +3244,9 @@ static void gc_queue_remset(jl_ptls_t ptls, jl_ptls_t ptls2) size_t len = ptls2->heap.last_remset->len; void **items = ptls2->heap.last_remset->items; for (size_t i = 0; i < len; i++) { - // Objects in the `remset` are already marked, - // so a 
`gc_try_claim_and_push` wouldn't work here - gc_mark_outrefs(ptls, &ptls->mark_queue, (jl_value_t *)items[i], 1); + // Tag the pointer to indicate it's in the remset + jl_value_t *v = (jl_value_t *)((uintptr_t)items[i] | GC_REMSET_PTR_TAG); + gc_ptr_queue_push(&ptls->mark_queue, v); } } diff --git a/src/gc.h b/src/gc.h index 161434e92442d..666fcc23339b0 100644 --- a/src/gc.h +++ b/src/gc.h @@ -116,6 +116,8 @@ typedef struct _jl_gc_chunk_t { #define GC_PTR_QUEUE_INIT_SIZE (1 << 18) // initial size of queue of `jl_value_t *` #define GC_CHUNK_QUEUE_INIT_SIZE (1 << 14) // initial size of chunk-queue +#define GC_REMSET_PTR_TAG (0x1) // lowest bit of `jl_value_t *` is tagged if it's in the remset + // layout for big (>2k) objects JL_EXTENSION typedef struct _bigval_t { From 08a4f1eeeeb47f55ef7a6a39bc034c02a93cebcc Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Thu, 25 Apr 2024 11:32:57 -0300 Subject: [PATCH 3/7] print types more verbosely in page profiler (#53256) Instead of just printing `Tuple`, print the full type signature such as `Tuple{UnionAll, GenericMemory{:not_atomic, Tuple{Int64, Int64, Array{Pair{Any, Any}, 1}}, Core.AddrSpace{Core}(0x00)}, Int64}` --- src/Makefile | 3 +- src/gc-page-profiler.c | 24 ++++++++--- src/gc-page-profiler.h | 93 +++++++++++++++++++++++++++++++++++++++--- src/gc.c | 36 +++++++++++----- src/julia_internal.h | 2 +- 5 files changed, 134 insertions(+), 24 deletions(-) diff --git a/src/Makefile b/src/Makefile index 334a9baa6ade8..0127e513ce144 100644 --- a/src/Makefile +++ b/src/Makefile @@ -302,9 +302,10 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc.h $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h -$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h 
$(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h +$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-alloc-profiler.h +$(BUILDDIR)/gc-page-profiler.o $(BUILDDIR)/gc-page-profiler.dbg.obj: $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c index 5af1c3d014770..2e876e4b7b4d6 100644 --- a/src/gc-page-profiler.c +++ b/src/gc-page-profiler.c @@ -20,6 +20,8 @@ gc_page_profiler_serializer_t gc_page_serializer_create(void) JL_NOTSAFEPOINT gc_page_profiler_serializer_t serializer; if (__unlikely(page_profile_enabled)) { arraylist_new(&serializer.typestrs, GC_PAGE_SZ); + serializer.buffers = (char *)malloc_s(GC_PAGE_PROFILER_SERIALIZER_INIT_CAPACITY); + serializer.cursor = 0; } else { serializer.typestrs.len = 0; @@ -34,6 +36,8 @@ void gc_page_serializer_init(gc_page_profiler_serializer_t *serializer, serializer->typestrs.len = 0; serializer->data = (char *)pg->data; serializer->osize = pg->osize; + serializer->cursor = 0; + serializer->capacity = GC_PAGE_PROFILER_SERIALIZER_INIT_CAPACITY; } } @@ -41,6 +45,7 @@ void gc_page_serializer_destroy(gc_page_profiler_serializer_t *serializer) JL_NO { if (__unlikely(page_profile_enabled)) { arraylist_free(&serializer->typestrs); + free(serializer->buffers); } } @@ -71,8 +76,9 @@ void gc_page_profile_write_preamble(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT { if (__unlikely(page_profile_enabled)) { - char 
str[GC_TYPE_STR_MAXLEN]; - snprintf(str, GC_TYPE_STR_MAXLEN, + const size_t large_enough_str_size = 4096; + char str[large_enough_str_size]; + snprintf(str, large_enough_str_size, "{\"address\": \"%p\",\"object_size\": %d,\"objects\": [", serializer->data, serializer->osize); ios_write(page_profile_stream, str, strlen(str)); @@ -102,22 +108,27 @@ void gc_page_profile_write_comma(gc_page_profiler_serializer_t *serializer) JL_N void gc_page_profile_write_to_file(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT { + size_t large_enough_str_size = 4096; if (__unlikely(page_profile_enabled)) { // write to file uv_mutex_lock(&page_profile_lock); gc_page_profile_write_comma(serializer); gc_page_profile_write_preamble(serializer); - char str[GC_TYPE_STR_MAXLEN]; + char *str = (char *)malloc_s(large_enough_str_size); for (size_t i = 0; i < serializer->typestrs.len; i++) { const char *name = (const char *)serializer->typestrs.items[i]; if (name == GC_SERIALIZER_EMPTY) { - snprintf(str, GC_TYPE_STR_MAXLEN, "\"empty\","); + snprintf(str, large_enough_str_size, "\"empty\","); } else if (name == GC_SERIALIZER_GARBAGE) { - snprintf(str, GC_TYPE_STR_MAXLEN, "\"garbage\","); + snprintf(str, large_enough_str_size, "\"garbage\","); } else { - snprintf(str, GC_TYPE_STR_MAXLEN, "\"%s\",", name); + while ((strlen(name) + 1) > large_enough_str_size) { + large_enough_str_size *= 2; + str = (char *)realloc_s(str, large_enough_str_size); + } + snprintf(str, large_enough_str_size, "\"%s\",", name); } // remove trailing comma for last element if (i == serializer->typestrs.len - 1) { @@ -125,6 +136,7 @@ void gc_page_profile_write_to_file(gc_page_profiler_serializer_t *serializer) } ios_write(page_profile_stream, str, strlen(str)); } + free(str); gc_page_profile_write_epilogue(serializer); page_profile_pages_written++; uv_mutex_unlock(&page_profile_lock); diff --git a/src/gc-page-profiler.h b/src/gc-page-profiler.h index b103e23905ba5..28989f8f8e206 100644 --- a/src/gc-page-profiler.h 
+++ b/src/gc-page-profiler.h @@ -9,22 +9,29 @@ extern "C" { #endif -#define GC_TYPE_STR_MAXLEN (512) +#define GC_PAGE_PROFILER_SERIALIZER_INIT_CAPACITY (4096) typedef struct { arraylist_t typestrs; char *data; int osize; + char *buffers; + size_t cursor; + size_t capacity; } gc_page_profiler_serializer_t; // mutex for page profile extern uv_mutex_t page_profile_lock; +// whether page profiling is enabled +extern int page_profile_enabled; // Serializer functions gc_page_profiler_serializer_t gc_page_serializer_create(void) JL_NOTSAFEPOINT; -void gc_page_serializer_init(gc_page_profiler_serializer_t *serializer, jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT; +void gc_page_serializer_init(gc_page_profiler_serializer_t *serializer, + jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT; void gc_page_serializer_destroy(gc_page_profiler_serializer_t *serializer) JL_NOTSAFEPOINT; -void gc_page_serializer_write(gc_page_profiler_serializer_t *serializer, const char *str) JL_NOTSAFEPOINT; +void gc_page_serializer_write(gc_page_profiler_serializer_t *serializer, + const char *str) JL_NOTSAFEPOINT; // Page profile functions #define GC_SERIALIZER_EMPTY ((const char *)0x1) #define GC_SERIALIZER_GARBAGE ((const char *)0x2) @@ -42,13 +49,89 @@ STATIC_INLINE void gc_page_profile_write_garbage(gc_page_profiler_serializer_t * gc_page_serializer_write(serializer, GC_SERIALIZER_GARBAGE); } } +STATIC_INLINE char *gc_page_profile_request_buffer(gc_page_profiler_serializer_t *serializer, size_t size) JL_NOTSAFEPOINT +{ + while (serializer->cursor + size >= serializer->capacity) { + serializer->capacity *= 2; + serializer->buffers = (char *)realloc_s(serializer->buffers, serializer->capacity); + } + char *p = &serializer->buffers[serializer->cursor]; + memset(p, 0, size); + serializer->cursor += size; + return p; +} STATIC_INLINE void gc_page_profile_write_live_obj(gc_page_profiler_serializer_t *serializer, jl_taggedvalue_t *v, int enabled) JL_NOTSAFEPOINT { if (__unlikely(enabled)) { - const char *name = 
jl_typeof_str(jl_valueof(v)); - gc_page_serializer_write(serializer, name); + jl_value_t *a = jl_valueof(v); + jl_value_t *t = jl_typeof(a); + ios_t str_; + int ios_need_close = 0; + char *type_name = NULL; + char *type_name_in_serializer = NULL; + if (t == (jl_value_t *)jl_get_buff_tag()) { + type_name = "Buffer"; + type_name_in_serializer = + gc_page_profile_request_buffer(serializer, strlen(type_name) + 1); + strcpy(type_name_in_serializer, type_name); + } + else if (jl_is_string(a)) { + type_name = "String"; + type_name_in_serializer = + gc_page_profile_request_buffer(serializer, strlen(type_name) + 1); + strcpy(type_name_in_serializer, type_name); + } + else if (jl_is_symbol(a)) { + type_name = jl_symbol_name((jl_sym_t *)a); + type_name_in_serializer = + gc_page_profile_request_buffer(serializer, strlen(type_name) + 1); + strcpy(type_name_in_serializer, type_name); + } + else if (jl_is_simplevector(a)) { + type_name = "SimpleVector"; + type_name_in_serializer = + gc_page_profile_request_buffer(serializer, strlen(type_name) + 1); + strcpy(type_name_in_serializer, type_name); + } + else if (jl_is_module(a)) { + type_name = jl_symbol_name_(((jl_module_t *)a)->name); + type_name_in_serializer = + gc_page_profile_request_buffer(serializer, strlen(type_name) + 1); + strcpy(type_name_in_serializer, type_name); + } + else if (jl_is_task(a)) { + type_name = "Task"; + type_name_in_serializer = + gc_page_profile_request_buffer(serializer, strlen(type_name) + 1); + strcpy(type_name_in_serializer, type_name); + } + else if (jl_is_datatype(a)) { + ios_need_close = 1; + ios_mem(&str_, 0); + JL_STREAM *str = (JL_STREAM *)&str_; + jl_static_show(str, a); + type_name = str_.buf; + type_name_in_serializer = + gc_page_profile_request_buffer(serializer, str_.size + 1); + memcpy(type_name_in_serializer, type_name, str_.size); + } + else { + ios_need_close = 1; + ios_mem(&str_, 0); + JL_STREAM *str = (JL_STREAM *)&str_; + jl_static_show(str, t); + type_name = str_.buf; + 
type_name_in_serializer = + gc_page_profile_request_buffer(serializer, str_.size + 1); + memcpy(type_name_in_serializer, type_name, str_.size); + } + gc_page_serializer_write(serializer, type_name_in_serializer); + if (ios_need_close) { + ios_close(&str_); + } + jl_may_leak(type_name_in_serializer); } } void gc_enable_page_profile(void) JL_NOTSAFEPOINT; diff --git a/src/gc.c b/src/gc.c index 7df52da0fbcef..14df2bc338aef 100644 --- a/src/gc.c +++ b/src/gc.c @@ -196,7 +196,7 @@ static size_t last_long_collect_interval; int gc_n_threads; jl_ptls_t* gc_all_tls_states; const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 -JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) +JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { return jl_buff_tag; } @@ -1655,17 +1655,32 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch) { int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch); - jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch); - if (!parallel_sweep_worthwhile) { + if (parallel_sweep_worthwhile && !page_profile_enabled) { + jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch); + uv_mutex_lock(&gc_threads_lock); + for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + assert(ptls2 != NULL); // should be a GC thread + jl_atomic_fetch_add(&ptls2->gc_sweeps_requested, 1); + } + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); return; } - uv_mutex_lock(&gc_threads_lock); - for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) { - jl_ptls_t ptls2 = gc_all_tls_states[i]; - jl_atomic_fetch_add(&ptls2->gc_sweeps_requested, 1); + if (page_profile_enabled) { + // we need to ensure that no threads are running sweeping when + // collecting a page profile. 
+ // wait for all to leave in order to ensure that a straggler doesn't + // try to enter sweeping after we set `gc_allocd_scratch` below. + for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + assert(ptls2 != NULL); // should be a GC thread + while (jl_atomic_load_acquire(&ptls2->gc_sweeps_requested) != 0) { + jl_cpu_pause(); + } + } } - uv_cond_broadcast(&gc_threads_cond); - uv_mutex_unlock(&gc_threads_lock); + jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch); } // wait for all threads to finish sweeping @@ -1795,8 +1810,7 @@ static void gc_sweep_pool(void) } // the actual sweeping - jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) malloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t)); - memset(new_gc_allocd_scratch, 0, n_threads * sizeof(jl_gc_padded_page_stack_t)); + jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t)); jl_ptls_t ptls = jl_current_task->ptls; gc_sweep_wake_all(ptls, new_gc_allocd_scratch); gc_sweep_pool_parallel(ptls); diff --git a/src/julia_internal.h b/src/julia_internal.h index 41c4304a9205f..1ea84c0dc0d4f 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -535,7 +535,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc(jl_ptls_t ptls, size_t sz, void *ty); // defined as uint64_t[3] so that we can get the right alignment of this and a "type tag" on it const extern uint64_t _jl_buff_tag[3]; #define jl_buff_tag ((uintptr_t)LLT_ALIGN((uintptr_t)&_jl_buff_tag[1],16)) -JL_DLLEXPORT uintptr_t jl_get_buff_tag(void); +JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT; typedef void jl_gc_tracked_buffer_t; // For the benefit of the static analyzer STATIC_INLINE jl_gc_tracked_buffer_t *jl_gc_alloc_buf(jl_ptls_t ptls, size_t sz) From 0ed80bdc4464d937362407b0b9adc7ce47beb57c Mon Sep 17 00:00:00 2001 From: Diogo Netto 
<61364108+d-netto@users.noreply.github.com> Date: Sun, 9 Jun 2024 18:50:56 -0300 Subject: [PATCH 4/7] fix stale docstring in gc_sweep_page (#54743) --- src/gc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gc.c b/src/gc.c index 14df2bc338aef..ff7f9cd657f61 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1423,7 +1423,8 @@ STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT int64_t buffered_pages = 0; -// Returns pointer to terminal pointer of list rooted at *pfl. +// Walks over a page, reconstruting the free lists if the page contains at least one live object. If not, +// queues up the page for later decommit (i.e. through `madvise` on Unix). static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *buffered, jl_gc_pagemeta_t *pg, int osize) JL_NOTSAFEPOINT { From e0726dcb720f54a34daaaf6bd10bd26f1da311f9 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Sun, 9 Jun 2024 23:11:37 -0300 Subject: [PATCH 5/7] use atomic-fetch-and in write barrier slow-path (#54744) Not sure why we're not using it, but we probably should. --- src/gc.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/gc.c b/src/gc.c index ff7f9cd657f61..e4bcabdaa9ad3 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1891,13 +1891,15 @@ JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) { jl_ptls_t ptls = jl_current_task->ptls; jl_taggedvalue_t *o = jl_astaggedvalue(ptr); - // The modification of the `gc_bits` is not atomic but it - // should be safe here since GC is not allowed to run here and we only - // write GC_OLD to the GC bits outside GC. This could cause - // duplicated objects in the remset but that shouldn't be a problem. - o->bits.gc = GC_MARKED; - arraylist_push(ptls->heap.remset, (jl_value_t*)ptr); - ptls->heap.remset_nptr++; // conservative + // The modification of the `gc_bits` needs to be atomic. 
+ // We need to ensure that objects are in the remset at + most once, since the mark phase may update page metadata, + which is not idempotent. See comments in https://github.com/JuliaLang/julia/issues/50419 + uintptr_t header = jl_atomic_fetch_and_relaxed((_Atomic(uintptr_t) *)&o->header, ~GC_OLD); + if (header & GC_OLD) { // write barrier has not been triggered in this object yet + arraylist_push(ptls->heap.remset, (jl_value_t*)ptr); + ptls->heap.remset_nptr++; // conservative + } } void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT From f4b9539e886c47317d99eb186060ef908bbb8bc1 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Thu, 27 Jun 2024 01:03:07 -0300 Subject: [PATCH 6/7] remove a bunch of unnecessary bit clearing in bigval's sz field (#54946) We don't store anything in the lowest two bits of `sz` after https://github.com/JuliaLang/julia/pull/49644. --- src/gc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gc.c b/src/gc.c index e4bcabdaa9ad3..cf08652b24e10 100644 --- a/src/gc.c +++ b/src/gc.c @@ -848,11 +848,11 @@ STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, assert(!gc_alloc_map_is_set((char*)o)); bigval_t *hdr = bigval_header(o); if (mark_mode == GC_OLD_MARKED) { - ptls->gc_cache.perm_scanned_bytes += hdr->sz & ~3; + ptls->gc_cache.perm_scanned_bytes += hdr->sz; gc_queue_big_marked(ptls, hdr, 0); } else { - ptls->gc_cache.scanned_bytes += hdr->sz & ~3; + ptls->gc_cache.scanned_bytes += hdr->sz; // We can't easily tell if the object is old or being promoted // from the gc bits but if the `age` is `0` then the object // must be already on a young list. 
@@ -862,7 +862,7 @@ STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, } } objprofile_count(jl_typeof(jl_valueof(o)), - mark_mode == GC_OLD_MARKED, hdr->sz & ~3); + mark_mode == GC_OLD_MARKED, hdr->sz); } // This function should be called exactly once during marking for each pool @@ -1074,9 +1074,9 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT *pv = nxt; if (nxt) nxt->prev = pv; - gc_num.freed += v->sz&~3; + gc_num.freed += v->sz; #ifdef MEMDEBUG - memset(v, 0xbb, v->sz&~3); + memset(v, 0xbb, v->sz); #endif gc_invoke_callbacks(jl_gc_cb_notify_external_free_t, gc_cblist_notify_external_free, (v)); From 2e9911e76f52733912738c85e11588bf87a5cced Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Fri, 28 Jun 2024 12:46:50 -0300 Subject: [PATCH 7/7] simplify handling of buffered pages (#54961) Simplifies handling of buffered pages by keeping them in a single place (`global_page_pool_lazily_freed`) instead of making them thread local. Performance has been assessed on the serial & multithreaded GCBenchmarks and it has shown to be performance neutral. --- src/gc.c | 73 ++++++++++++++++++++------------------------- src/gc.h | 2 ++ src/julia_threads.h | 1 - 3 files changed, 34 insertions(+), 42 deletions(-) diff --git a/src/gc.c b/src/gc.c index cf08652b24e10..3411575dd4c16 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1283,13 +1283,7 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT // Do not pass in `ptls` as argument. 
This slows down the fast path // in pool_alloc significantly jl_ptls_t ptls = jl_current_task->ptls; - jl_gc_pagemeta_t *pg = pop_lf_back(&ptls->page_metadata_buffered); - if (pg != NULL) { - gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED); - } - else { - pg = jl_gc_alloc_page(); - } + jl_gc_pagemeta_t *pg = jl_gc_alloc_page(); pg->osize = p->osize; pg->thread_n = ptls->tid; set_page_metadata(pg); @@ -1421,12 +1415,9 @@ STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT } } -int64_t buffered_pages = 0; - // Walks over a page, reconstruting the free lists if the page contains at least one live object. If not, // queues up the page for later decommit (i.e. through `madvise` on Unix). -static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *buffered, - jl_gc_pagemeta_t *pg, int osize) JL_NOTSAFEPOINT +static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pagemeta_t *pg, int osize) JL_NOTSAFEPOINT { char *data = pg->data; jl_taggedvalue_t *v0 = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); @@ -1442,22 +1433,10 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ gc_page_serializer_init(s, pg); int re_use_page = 1; - int keep_as_local_buffer = 0; int freedall = 1; int pg_skpd = 1; if (!pg->has_marked) { re_use_page = 0; - #ifdef _P64 // TODO: re-enable on `_P32`? - // lazy version: (empty) if the whole page was already unused, free it (return it to the pool) - // eager version: (freedall) free page as soon as possible - // the eager one uses less memory. 
- // FIXME - need to do accounting on a per-thread basis - // on quick sweeps, keep a few pages empty but allocated for performance - if (!current_sweep_full && buffered_pages <= default_collect_interval / GC_PAGE_SZ) { - buffered_pages++; - keep_as_local_buffer = 1; - } - #endif nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / osize; gc_page_profile_write_empty_page(s, page_profile_enabled); goto done; @@ -1544,12 +1523,7 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ } else { gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED); - if (keep_as_local_buffer) { - push_lf_back(buffered, pg); - } - else { - push_lf_back(&global_page_pool_lazily_freed, pg); - } + push_lf_back(&global_page_pool_lazily_freed, pg); } gc_page_profile_write_to_file(s); gc_update_page_fragmentation_data(pg); @@ -1560,15 +1534,14 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ } // the actual sweeping over all allocated pages in a memory pool -STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *lazily_freed, - jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT +STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_page_stack_t *allocd, jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { int p_n = pg->pool_n; int t_n = pg->thread_n; jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; - gc_sweep_page(s, p, allocd, lazily_freed, pg, osize); + gc_sweep_page(s, p, allocd, pg, osize); } // sweep over all memory that is being used and not in a pool @@ -1634,7 +1607,7 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc push_lf_back_nosync(&tmp, pg); } else { - gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg); + gc_sweep_pool_page(&serializer, dest, pg); } if (n_pages_to_scan >= n_pages_worth_parallel_sweep) { break; @@ -1715,7 +1688,7 @@ void 
gc_sweep_pool_parallel(jl_ptls_t ptls) if (pg == NULL) { continue; } - gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg); + gc_sweep_pool_page(&serializer, dest, pg); found_pg = 1; } if (!found_pg) { @@ -1747,21 +1720,45 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls) // free all pages (i.e. through `madvise` on Linux) that were lazily freed void gc_free_pages(void) { + size_t n_pages_seen = 0; + jl_gc_page_stack_t tmp; + memset(&tmp, 0, sizeof(tmp)); while (1) { jl_gc_pagemeta_t *pg = pop_lf_back(&global_page_pool_lazily_freed); if (pg == NULL) { break; } + n_pages_seen++; + // keep the last few pages around for a while + if (n_pages_seen * GC_PAGE_SZ <= default_collect_interval) { + push_lf_back(&tmp, pg); + continue; + } jl_gc_free_page(pg); push_lf_back(&global_page_pool_freed, pg); } + // If concurrent page sweeping is disabled, then `gc_free_pages` will be called in the stop-the-world + // phase. We can guarantee, therefore, that there won't be any concurrent modifications to + // `global_page_pool_lazily_freed`, so it's safe to assign `tmp` back to `global_page_pool_lazily_freed`. + // Otherwise, we need to use the thread-safe push_lf_back/pop_lf_back functions. 
+ if (jl_n_sweepthreads == 0) { + global_page_pool_lazily_freed = tmp; + } + else { + while (1) { + jl_gc_pagemeta_t *pg = pop_lf_back(&tmp); + if (pg == NULL) { + break; + } + push_lf_back(&global_page_pool_lazily_freed, pg); + } + } } // setup the data-structures for a sweep over all memory pools static void gc_sweep_pool(void) { gc_time_pool_start(); - buffered_pages = 0; // For the benefit of the analyzer, which doesn't know that gc_n_threads // doesn't change over the course of this function @@ -1802,12 +1799,6 @@ static void gc_sweep_pool(void) pg->has_young = 1; } } - jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_buffered.bottom); - while (pg != NULL) { - jl_gc_pagemeta_t *pg2 = pg->next; - buffered_pages++; - pg = pg2; - } } // the actual sweeping diff --git a/src/gc.h b/src/gc.h index 666fcc23339b0..bcba63a613b1f 100644 --- a/src/gc.h +++ b/src/gc.h @@ -150,6 +150,8 @@ typedef struct _mallocarray_t { // pool page metadata typedef struct _jl_gc_pagemeta_t { + // next metadata structure in per-thread list + // or in one of the `jl_gc_page_stack_t` struct _jl_gc_pagemeta_t *next; // index of pool that owns this page uint8_t pool_n; diff --git a/src/julia_threads.h b/src/julia_threads.h index 56c74c2cff698..a80084a86f4b3 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -261,7 +261,6 @@ typedef struct _jl_tls_states_t { jl_thread_t system_id; arraylist_t finalizers; jl_gc_page_stack_t page_metadata_allocd; - jl_gc_page_stack_t page_metadata_buffered; jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; arraylist_t sweep_objs;