GC scheduler refinements #52294

Merged
merged 1 commit into from
Dec 5, 2023
117 changes: 101 additions & 16 deletions src/gc.c
@@ -31,6 +31,8 @@ uv_mutex_t gc_threads_lock;
uv_cond_t gc_threads_cond;
// To indicate whether concurrent sweeping should run
uv_sem_t gc_sweep_assists_needed;
// Mutex used to coordinate entry of GC threads in the mark loop
uv_mutex_t gc_queue_observer_lock;

// Linked list of callback functions

@@ -2863,8 +2865,10 @@ void gc_mark_and_steal(jl_ptls_t ptls)
jl_gc_markqueue_t *mq = &ptls->mark_queue;
jl_gc_markqueue_t *mq_master = NULL;
int master_tid = jl_atomic_load(&gc_master_tid);
if (master_tid != -1)
mq_master = &gc_all_tls_states[master_tid]->mark_queue;
if (master_tid == -1) {
return;
}
mq_master = &gc_all_tls_states[master_tid]->mark_queue;
void *new_obj;
jl_gc_chunk_t c;
pop : {
@@ -2939,28 +2943,108 @@ void gc_mark_and_steal(jl_ptls_t ptls)
}
}

size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT
{
// assume each chunk is worth 256 units of work and each pointer
// is worth 1 unit of work
size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) -
jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top));
work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) -
jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top));
return work;
}
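
Side note on the arithmetic above: each queue's size is just the distance between its Chase-Lev bottom and top indices, and chunks are weighted 256:1 against pointers. A minimal standalone sketch of that estimate, using hypothetical ws_queue_t/queue_size names rather than the actual jl_gc_markqueue_t internals:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    typedef struct {
        atomic_long bottom; // next slot the owner will push to
        atomic_long top;    // next slot thieves will steal from
    } ws_queue_t;

    // bottom - top is the number of items currently sitting in the deque
    static size_t queue_size(ws_queue_t *q)
    {
        return (size_t)(atomic_load_explicit(&q->bottom, memory_order_relaxed) -
                        atomic_load_explicit(&q->top, memory_order_relaxed));
    }

    // Weighted estimate mirroring gc_count_work_in_queue: 256 per chunk, 1 per pointer
    static size_t count_work(ws_queue_t *chunk_queue, ws_queue_t *ptr_queue)
    {
        return 256 * queue_size(chunk_queue) + queue_size(ptr_queue);
    }

    int main(void)
    {
        ws_queue_t chunks = {3, 1};   // 2 chunks queued
        ws_queue_t ptrs   = {40, 10}; // 30 pointers queued
        printf("estimated work: %zu\n", count_work(&chunks, &ptrs)); // 2*256 + 30 = 542
        return 0;
    }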

/**
* Correctness argument for the mark-loop termination protocol.
*
* Safety properties:
* - No work items shall be in any thread's queues when `gc_mark_loop_barrier` observes
* that `gc_n_threads_marking` is zero.
*
* - No work item shall be stolen from the master thread (i.e. mutator thread which started
* GC and which helped the `jl_n_markthreads` - 1 threads to mark) after
* `gc_mark_loop_barrier` observes that `gc_n_threads_marking` is zero. This property is
* necessary because we call `gc_mark_loop_serial` after marking the finalizer list in
* `_jl_gc_collect`, and want to ensure that we have the serial mark-loop semantics there,
* and that no work is stolen from us at that point.
*
* Proof:
* - Suppose the master thread observes that `gc_n_threads_marking` is zero in
* `gc_mark_loop_barrier` and there is a work item left in one thread's queue at that point.
* Since threads try to steal from all threads' queues, this implies that all threads must
* have tried to steal from the queue which still has a work item left, but failed to do so,
* which violates the semantics of Chase-Lev's work-stealing queue.
*
* - Let E1 be the event "master thread writes -1 to gc_master_tid" and E2 be the event
* "master thread observes that `gc_n_threads_marking` is zero". Since we're using
* sequentially consistent atomics, E1 => E2. Now suppose one thread which is spinning in
* `gc_should_mark` tries to enter the mark-loop after E2. In order to do so, it must
* increment `gc_n_threads_marking` to 1 in an event E3, and then read `gc_master_tid` in an
* event E4. Since we're using sequentially consistent atomics, E3 => E4. Since we observed
* `gc_n_threads_marking` as zero in E2, then E2 => E3, and we conclude E1 => E4, so that
* the thread which is spinning in `gc_should_mark` must observe that `gc_master_tid` is -1
* and therefore won't enter the mark-loop.
*/

int gc_should_mark(jl_ptls_t ptls)
{
int should_mark = 0;
int n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
// fast path
if (n_threads_marking == 0) {
return 0;
}
uv_mutex_lock(&gc_queue_observer_lock);
while (1) {
int tid = jl_atomic_load(&gc_master_tid);
// fast path
if (tid == -1) {
break;
}
n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
// fast path
if (n_threads_marking == 0) {
break;
}
size_t work = gc_count_work_in_queue(gc_all_tls_states[tid]);
for (tid = gc_first_tid; tid < gc_first_tid + jl_n_markthreads; tid++) {
work += gc_count_work_in_queue(gc_all_tls_states[tid]);
}
// if there is a lot of work left, enter the mark loop
if (work >= 16 * n_threads_marking) {
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
should_mark = 1;
break;
}
jl_cpu_pause();
}
uv_mutex_unlock(&gc_queue_observer_lock);
return should_mark;
}
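
The ordering argument in the comment above can be condensed into a standalone sketch with C11 sequentially consistent atomics. The globals below are stand-ins for gc_master_tid and gc_n_threads_marking, and the two functions model the E1/E2 and E3/E4 sides of the protocol; this illustrates the reasoning only and is not the runtime code:

    #include <stdatomic.h>

    static atomic_int master_tid = -1;        // stand-in for gc_master_tid
    static atomic_int n_threads_marking = 0;  // stand-in for gc_n_threads_marking

    // Master side: E1 (write -1) is program-ordered before E2 (observe zero).
    static int master_observes_termination(void)
    {
        atomic_store(&master_tid, -1);                 // E1
        return atomic_load(&n_threads_marking) == 0;   // E2
    }

    // Observer side: E3 (increment) is program-ordered before E4 (read the tid).
    static int observer_tries_to_join(void)
    {
        atomic_fetch_add(&n_threads_marking, 1);       // E3
        if (atomic_load(&master_tid) == -1) {          // E4
            // If E2 saw zero, E3 must come after E2, hence E4 comes after E1
            // and reads -1: the late joiner backs out instead of marking.
            atomic_fetch_add(&n_threads_marking, -1);
            return 0;
        }
        return 1;
    }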

void gc_wake_all_for_marking(jl_ptls_t ptls)
{
jl_atomic_store(&gc_master_tid, ptls->tid);
uv_mutex_lock(&gc_threads_lock);
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
uv_cond_broadcast(&gc_threads_cond);
uv_mutex_unlock(&gc_threads_lock);
}
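
gc_wake_all_for_marking is one half of a handshake; the waiting side lives outside this diff. A generic sketch of that counterpart, assuming the worker threads sleep on gc_threads_cond until gc_n_threads_marking becomes nonzero (an assumption, not the actual scheduler code):

    // Generic sketch of the waiting side of the gc_threads_lock / gc_threads_cond
    // handshake; illustrative only.
    static void gc_worker_wait_sketch(void)
    {
        uv_mutex_lock(&gc_threads_lock);
        // Re-checking the predicate under the lock means the increment and
        // broadcast done in gc_wake_all_for_marking cannot slip in between the
        // check and the wait, so no wakeup is lost.
        while (jl_atomic_load(&gc_n_threads_marking) == 0) {
            uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
        }
        uv_mutex_unlock(&gc_threads_lock);
        // ...then participate in marking, e.g. via gc_mark_loop_parallel(ptls, 0).
    }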

void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
{
int backoff = GC_BACKOFF_MIN;
if (master) {
jl_atomic_store(&gc_master_tid, ptls->tid);
// Wake threads up and try to do some work
uv_mutex_lock(&gc_threads_lock);
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
uv_cond_broadcast(&gc_threads_cond);
uv_mutex_unlock(&gc_threads_lock);
gc_wake_all_for_marking(ptls);
gc_mark_and_steal(ptls);
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
}
while (jl_atomic_load(&gc_n_threads_marking) > 0) {
// Try to become a thief while other threads are marking
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
if (jl_atomic_load(&gc_master_tid) != -1) {
gc_mark_and_steal(ptls);
while (1) {
int should_mark = gc_should_mark(ptls);
if (!should_mark) {
break;
}
gc_mark_and_steal(ptls);
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
// Failed to steal
gc_backoff(&backoff);
}
}
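
Because the +/- markers of the diff are not visible here, the old and new bodies of gc_mark_loop_parallel are interleaved above. Assuming the gc_wake_all_for_marking and gc_should_mark lines are the added side (consistent with gc_backoff and GC_BACKOFF_MIN being deleted from src/gc.h below), the post-patch function would read roughly as follows; this is a reconstruction, not an authoritative copy of the merged source:

    void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
    {
        if (master) {
            gc_wake_all_for_marking(ptls);
            gc_mark_and_steal(ptls);
            jl_atomic_fetch_add(&gc_n_threads_marking, -1);
        }
        while (1) {
            int should_mark = gc_should_mark(ptls);
            if (!should_mark) {
                break;
            }
            gc_mark_and_steal(ptls);
            jl_atomic_fetch_add(&gc_n_threads_marking, -1);
        }
    }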

@@ -3728,6 +3812,7 @@ void jl_gc_init(void)
uv_mutex_init(&gc_threads_lock);
uv_cond_init(&gc_threads_cond);
uv_sem_init(&gc_sweep_assists_needed, 0);
uv_mutex_init(&gc_queue_observer_lock);

jl_gc_init_page();
jl_gc_debug_init();
15 changes: 1 addition & 14 deletions src/gc.h
@@ -190,25 +190,12 @@ extern jl_gc_page_stack_t global_page_pool_lazily_freed;
extern jl_gc_page_stack_t global_page_pool_clean;
extern jl_gc_page_stack_t global_page_pool_freed;

#define GC_BACKOFF_MIN 4
#define GC_BACKOFF_MAX 12

STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
{
if (*i < GC_BACKOFF_MAX) {
(*i)++;
}
for (int j = 0; j < (1 << *i); j++) {
jl_cpu_pause();
}
}

// Lock-free stack implementation taken
// from Herlihy's "The Art of Multiprocessor Programming"
// XXX: this is not a general-purpose lock-free stack. We can
// get away with just using a CAS and not implementing some ABA
// prevention mechanism since once a node is popped from the
// `jl_gc_global_page_pool_t`, it may only be pushed back to them
// `jl_gc_page_stack_t`, it may only be pushed back to them
// in the sweeping phase, which also doesn't push a node into the
// same stack after it's popped
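
The ABA caveat above is easier to see in code. Below is a generic single-CAS stack sketch with hypothetical node_t/lf_stack_t names (not the actual jl_gc_page_stack_t implementation): in the general case the pop's CAS can succeed with a stale next pointer, and the GC only avoids this because a popped page node is re-pushed exclusively during sweeping, as the comment explains.

    #include <stdatomic.h>
    #include <stddef.h>

    typedef struct node {
        struct node *next;
    } node_t;

    typedef struct {
        _Atomic(node_t *) top;
    } lf_stack_t;

    static void lf_push(lf_stack_t *s, node_t *n)
    {
        node_t *old_top = atomic_load(&s->top);
        do {
            n->next = old_top;
        } while (!atomic_compare_exchange_weak(&s->top, &old_top, n));
    }

    static node_t *lf_pop(lf_stack_t *s)
    {
        node_t *old_top = atomic_load(&s->top);
        do {
            if (old_top == NULL)
                return NULL;
            // ABA hazard in general: if old_top is popped and re-pushed by other
            // threads between the load and the CAS, the CAS still succeeds with a
            // stale next pointer. The GC sidesteps this because a popped page
            // node is only pushed back during the sweeping phase.
        } while (!atomic_compare_exchange_weak(&s->top, &old_top, old_top->next));
        return old_top;
    }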
