Skip to content

Commit

Permalink
implement concurrent sweeping (#48969)
Browse files Browse the repository at this point in the history
Implements concurrent sweeping of fully empty pages.

Concurrent sweeping is disabled by default and may be enabled through the --gcthreads flag.

Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
  • Loading branch information
d-netto and vchuravy authored Jun 28, 2023
1 parent ba0e484 commit 9dc2991
Show file tree
Hide file tree
Showing 13 changed files with 158 additions and 43 deletions.
3 changes: 2 additions & 1 deletion base/options.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ struct JLOptions
cpu_target::Ptr{UInt8}
nthreadpools::Int16
nthreads::Int16
ngcthreads::Int16
nmarkthreads::Int16
nsweepthreads::Int8
nthreads_per_pool::Ptr{Int16}
nprocs::Int32
machine_file::Ptr{UInt8}
Expand Down
1 change: 1 addition & 0 deletions base/threadingconstructs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ end
Threads.ngcthreads() -> Int
Returns the number of GC threads currently configured.
This includes both mark threads and concurrent sweep threads.
"""
ngcthreads() = Int(unsafe_load(cglobal(:jl_n_gcthreads, Cint))) + 1

Expand Down
10 changes: 9 additions & 1 deletion src/gc-pages.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT
#endif
jl_gc_pagemeta_t *meta = NULL;

// try to get page from `pool_lazily_freed`
meta = pop_lf_page_metadata_back(&global_page_pool_lazily_freed);
if (meta != NULL) {
gc_alloc_map_set(meta->data, 1);
// page is already mapped
return meta;
}

// try to get page from `pool_clean`
meta = pop_lf_page_metadata_back(&global_page_pool_clean);
if (meta != NULL) {
Expand All @@ -112,7 +120,7 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT
}

uv_mutex_lock(&gc_perm_lock);
// another thread may have allocated a large block while we're waiting...
// another thread may have allocated a large block while we were waiting...
meta = pop_lf_page_metadata_back(&global_page_pool_clean);
if (meta != NULL) {
uv_mutex_unlock(&gc_perm_lock);
Expand Down
48 changes: 36 additions & 12 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,18 @@
extern "C" {
#endif

// Number of GC threads that may run parallel marking
int jl_n_markthreads;
// Number of GC threads that may run concurrent sweeping (0 or 1)
int jl_n_sweepthreads;
// Number of threads currently running the GC mark-loop
_Atomic(int) gc_n_threads_marking;
// `tid` of mutator thread that triggered GC
_Atomic(int) gc_master_tid;
// `tid` of first GC thread
int gc_first_tid;
// To indicate whether concurrent sweeping should run
uv_sem_t gc_sweep_assists_needed;

// Linked list of callback functions

Expand Down Expand Up @@ -1356,7 +1362,7 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo
int pg_skpd = 1;
if (!pg->has_marked) {
reuse_page = 0;
#ifdef _P64
#ifdef _P64 // TODO: re-enable on `_P32`?
// lazy version: (empty) if the whole page was already unused, free it (return it to the pool)
// eager version: (freedall) free page as soon as possible
// the eager one uses less memory.
Expand Down Expand Up @@ -1440,8 +1446,18 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo
push_page_metadata_back(lazily_freed, pg);
}
else {
#ifdef _P64 // only enable concurrent sweeping on 64bit
if (jl_n_sweepthreads == 0) {
jl_gc_free_page(pg);
push_lf_page_metadata_back(&global_page_pool_freed, pg);
}
else {
push_lf_page_metadata_back(&global_page_pool_lazily_freed, pg);
}
#else
jl_gc_free_page(pg);
push_lf_page_metadata_back(&global_page_pool_freed, pg);
#endif
}
gc_time_count_page(freedall, pg_skpd);
gc_num.freed += (nfree - old_nfree) * osize;
Expand Down Expand Up @@ -1561,6 +1577,13 @@ static void gc_sweep_pool(int sweep_full)
}
}

#ifdef _P64 // only enable concurrent sweeping on 64bit
// wake thread up to sweep concurrently
if (jl_n_sweepthreads > 0) {
uv_sem_post(&gc_sweep_assists_needed);
}
#endif

gc_time_pool_end(sweep_full);
}

Expand Down Expand Up @@ -2691,8 +2714,8 @@ void gc_mark_and_steal(jl_ptls_t ptls)
// of work for the mark loop
steal : {
// Try to steal chunk from random GC thread
for (int i = 0; i < 4 * jl_n_gcthreads; i++) {
uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads;
for (int i = 0; i < 4 * jl_n_markthreads; i++) {
uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_markthreads;
jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue;
c = gc_chunkqueue_steal_from(mq2);
if (c.cid != GC_empty_chunk) {
Expand All @@ -2701,7 +2724,7 @@ void gc_mark_and_steal(jl_ptls_t ptls)
}
}
// Sequentially walk GC threads to try to steal chunk
for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) {
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue;
c = gc_chunkqueue_steal_from(mq2);
if (c.cid != GC_empty_chunk) {
Expand All @@ -2718,15 +2741,15 @@ void gc_mark_and_steal(jl_ptls_t ptls)
}
}
// Try to steal pointer from random GC thread
for (int i = 0; i < 4 * jl_n_gcthreads; i++) {
uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_gcthreads;
for (int i = 0; i < 4 * jl_n_markthreads; i++) {
uint32_t v = gc_first_tid + cong(UINT64_MAX, UINT64_MAX, &ptls->rngseed) % jl_n_markthreads;
jl_gc_markqueue_t *mq2 = &gc_all_tls_states[v]->mark_queue;
new_obj = gc_ptr_queue_steal_from(mq2);
if (new_obj != NULL)
goto mark;
}
// Sequentially walk GC threads to try to steal pointer
for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) {
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
jl_gc_markqueue_t *mq2 = &gc_all_tls_states[i]->mark_queue;
new_obj = gc_ptr_queue_steal_from(mq2);
if (new_obj != NULL)
Expand All @@ -2748,7 +2771,7 @@ void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
jl_atomic_store(&gc_master_tid, ptls->tid);
// Wake threads up and try to do some work
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) {
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
jl_ptls_t ptls2 = gc_all_tls_states[i];
uv_mutex_lock(&ptls2->sleep_lock);
uv_cond_signal(&ptls2->wake_signal);
Expand All @@ -2771,7 +2794,7 @@ void gc_mark_loop_parallel(jl_ptls_t ptls, int master)

void gc_mark_loop(jl_ptls_t ptls)
{
if (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled) {
if (jl_n_markthreads == 0 || gc_heap_snapshot_enabled) {
gc_mark_loop_serial(ptls);
}
else {
Expand Down Expand Up @@ -3065,13 +3088,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
}

assert(gc_n_threads);
int single_threaded = (jl_n_gcthreads == 0 || gc_heap_snapshot_enabled);
int single_threaded_mark = (jl_n_markthreads == 0 || gc_heap_snapshot_enabled);
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
jl_ptls_t ptls_dest = ptls;
jl_gc_markqueue_t *mq_dest = mq;
if (!single_threaded) {
ptls_dest = gc_all_tls_states[gc_first_tid + t_i % jl_n_gcthreads];
if (!single_threaded_mark) {
ptls_dest = gc_all_tls_states[gc_first_tid + t_i % jl_n_markthreads];
mq_dest = &ptls_dest->mark_queue;
}
if (ptls2 != NULL) {
Expand Down Expand Up @@ -3513,6 +3536,7 @@ void jl_gc_init(void)
JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock");
uv_mutex_init(&gc_cache_lock);
uv_mutex_init(&gc_perm_lock);
uv_sem_init(&gc_sweep_assists_needed, 0);

jl_gc_init_page();
jl_gc_debug_init();
Expand Down
2 changes: 2 additions & 0 deletions src/gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ typedef struct {
_Atomic(jl_gc_pagemeta_t *) page_metadata_back;
} jl_gc_global_page_pool_t;

extern jl_gc_global_page_pool_t global_page_pool_lazily_freed;
extern jl_gc_global_page_pool_t global_page_pool_clean;
extern jl_gc_global_page_pool_t global_page_pool_freed;

Expand Down Expand Up @@ -428,6 +429,7 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFE
*list = hdr;
}

extern uv_sem_t gc_sweep_assists_needed;
extern _Atomic(int) gc_n_threads_marking;
void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT;
Expand Down
23 changes: 17 additions & 6 deletions src/jloptions.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ JL_DLLEXPORT void jl_init_options(void)
NULL, // cpu_target ("native", "core2", etc...)
0, // nthreadpools
0, // nthreads
0, // ngcthreads
0, // nmarkthreads
0, // nsweepthreads
NULL, // nthreads_per_pool
0, // nprocs
NULL, // machine_file
Expand Down Expand Up @@ -130,7 +131,8 @@ static const char opts[] =
" interface if supported (Linux and Windows) or to the number of CPU\n"
" threads if not supported (MacOS) or if process affinity is not\n"
" configured, and sets M to 1.\n"
" --gcthreads=N Use N threads for GC, set to half of the number of compute threads if unspecified.\n"
" --gcthreads=M[,N] Use M threads for the mark phase of GC and N (0 or 1) threads for the concurrent sweeping phase of GC.\n"
" M is set to half of the number of compute threads and N is set to 0 if unspecified.\n"
" -p, --procs {N|auto} Integer value N launches N additional local worker processes\n"
" \"auto\" launches as many workers as the number of local CPU threads (logical cores)\n"
" --machine-file <file> Run processes on hosts listed in <file>\n\n"
Expand Down Expand Up @@ -826,10 +828,19 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
break;
case opt_gc_threads:
errno = 0;
long ngcthreads = strtol(optarg, &endptr, 10);
if (errno != 0 || optarg == endptr || *endptr != 0 || ngcthreads < 1 || ngcthreads >= INT16_MAX)
jl_errorf("julia: --gcthreads=<n>; n must be an integer >= 1");
jl_options.ngcthreads = (int16_t)ngcthreads;
long nmarkthreads = strtol(optarg, &endptr, 10);
if (errno != 0 || optarg == endptr || nmarkthreads < 1 || nmarkthreads >= INT16_MAX) {
jl_errorf("julia: --gcthreads=<n>[,<m>]; n must be an integer >= 1");
}
jl_options.nmarkthreads = (int16_t)nmarkthreads;
if (*endptr == ',') {
errno = 0;
char *endptri;
long nsweepthreads = strtol(&endptr[1], &endptri, 10);
if (errno != 0 || endptri == &endptr[1] || *endptri != 0 || nsweepthreads < 0 || nsweepthreads > 1)
jl_errorf("julia: --gcthreads=<n>,<m>; n must be 0 or 1");
jl_options.nsweepthreads = (int8_t)nsweepthreads;
}
break;
case opt_permalloc_pkgimg:
if (!strcmp(optarg,"yes"))
Expand Down
3 changes: 2 additions & 1 deletion src/jloptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ typedef struct {
const char *cpu_target;
int8_t nthreadpools;
int16_t nthreads;
int16_t ngcthreads;
int16_t nmarkthreads;
int8_t nsweepthreads;
const int16_t *nthreads_per_pool;
int32_t nprocs;
const char *machine_file;
Expand Down
34 changes: 31 additions & 3 deletions src/partr.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,13 @@ void jl_init_threadinginfra(void)
void JL_NORETURN jl_finish_task(jl_task_t *t);


static int may_mark(void) JL_NOTSAFEPOINT
static inline int may_mark(void) JL_NOTSAFEPOINT
{
return (jl_atomic_load(&gc_n_threads_marking) > 0);
}

// gc thread function
void jl_gc_threadfun(void *arg)
// gc thread mark function
void jl_gc_mark_threadfun(void *arg)
{
jl_threadarg_t *targ = (jl_threadarg_t*)arg;

Expand All @@ -139,6 +139,34 @@ void jl_gc_threadfun(void *arg)
}
}

// gc thread sweep function
void jl_gc_sweep_threadfun(void *arg)
{
jl_threadarg_t *targ = (jl_threadarg_t*)arg;

// initialize this thread (set tid and create heap)
jl_ptls_t ptls = jl_init_threadtls(targ->tid);

// wait for all threads
jl_gc_state_set(ptls, JL_GC_STATE_WAITING, 0);
uv_barrier_wait(targ->barrier);

// free the thread argument here
free(targ);

while (1) {
uv_sem_wait(&gc_sweep_assists_needed);
while (1) {
jl_gc_pagemeta_t *pg = pop_lf_page_metadata_back(&global_page_pool_lazily_freed);
if (pg == NULL) {
break;
}
jl_gc_free_page(pg);
push_lf_page_metadata_back(&global_page_pool_freed, pg);
}
}
}

// thread function: used by all mutator threads except the main thread
void jl_threadfun(void *arg)
{
Expand Down
46 changes: 33 additions & 13 deletions src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,8 @@ static void jl_check_tls(void)
JL_DLLEXPORT const int jl_tls_elf_support = 0;
#endif

extern int jl_n_markthreads;
extern int jl_n_sweepthreads;
extern int gc_first_tid;

// interface to Julia; sets up to make the runtime thread-safe
Expand Down Expand Up @@ -653,22 +655,37 @@ void jl_init_threading(void)
}
}

int16_t ngcthreads = jl_options.ngcthreads - 1;
if (ngcthreads == -1 &&
(cp = getenv(NUM_GC_THREADS_NAME))) { // ENV[NUM_GC_THREADS_NAME] specified

ngcthreads = (uint64_t)strtol(cp, NULL, 10) - 1;
}
if (ngcthreads == -1) {
// if `--gcthreads` was not specified, set the number of GC threads
// to half of compute threads
if (nthreads <= 1) {
ngcthreads = 0;
jl_n_markthreads = jl_options.nmarkthreads - 1;
jl_n_sweepthreads = jl_options.nsweepthreads;
if (jl_n_markthreads == -1) { // --gcthreads not specified
if ((cp = getenv(NUM_GC_THREADS_NAME))) { // ENV[NUM_GC_THREADS_NAME] specified
errno = 0;
jl_n_markthreads = (uint64_t)strtol(cp, &endptr, 10) - 1;
if (errno != 0 || endptr == cp || nthreads <= 0)
jl_n_markthreads = 0;
cp = endptr;
if (*cp == ',') {
cp++;
errno = 0;
jl_n_sweepthreads = strtol(cp, &endptri, 10);
if (errno != 0 || endptri == cp || jl_n_sweepthreads < 0) {
jl_n_sweepthreads = 0;
}
}
}
else {
ngcthreads = (nthreads / 2) - 1;
// if `--gcthreads` or ENV[NUM_GCTHREADS_NAME] was not specified,
// set the number of mark threads to half of compute threads
// and number of sweep threads to 0
if (nthreads <= 1) {
jl_n_markthreads = 0;
}
else {
jl_n_markthreads = (nthreads / 2) - 1;
}
}
}
int16_t ngcthreads = jl_n_markthreads + jl_n_sweepthreads;

jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads;
jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int));
Expand Down Expand Up @@ -734,8 +751,11 @@ void jl_start_threads(void)
mask[i] = 0;
}
}
else if (i == nthreads - 1 && jl_n_sweepthreads == 1) {
uv_thread_create(&uvtid, jl_gc_sweep_threadfun, t);
}
else {
uv_thread_create(&uvtid, jl_gc_threadfun, t);
uv_thread_create(&uvtid, jl_gc_mark_threadfun, t);
}
uv_thread_detach(&uvtid);
}
Expand Down
3 changes: 2 additions & 1 deletion src/threading.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ jl_ptls_t jl_init_threadtls(int16_t tid) JL_NOTSAFEPOINT;

// provided by a threading infrastructure
void jl_init_threadinginfra(void);
void jl_gc_threadfun(void *arg);
void jl_gc_mark_threadfun(void *arg);
void jl_gc_sweep_threadfun(void *arg);
void jl_threadfun(void *arg);

#ifdef __cplusplus
Expand Down
Loading

0 comments on commit 9dc2991

Please sign in to comment.