From 550f865685c3517102f30dac4c3c1666005a9904 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Mon, 25 Sep 2023 13:20:19 +0200 Subject: [PATCH] add support for async backtraces of Tasks on any thread (#51430) --- src/Makefile | 2 +- src/gc-stacks.c | 80 +++++--- src/gc.c | 8 +- src/interpreter.c | 3 +- src/julia.h | 5 + src/julia_internal.h | 13 +- src/julia_threads.h | 8 +- src/mtarraylist.c | 81 ++++++++ src/signals-mach.c | 29 ++- src/signals-unix.c | 54 +++-- src/signals-win.c | 109 ++++++---- src/stackwalk.c | 470 +++++++++++++++++++++++-------------------- src/threading.c | 6 +- 13 files changed, 536 insertions(+), 332 deletions(-) create mode 100644 src/mtarraylist.c diff --git a/src/Makefile b/src/Makefile index 51d8bf1f2de65a..42ff9aa3a9b361 100644 --- a/src/Makefile +++ b/src/Makefile @@ -43,7 +43,7 @@ endif SRCS := \ jltypes gf typemap smallintset ast builtins module interpreter symbol \ dlload sys init task array staticdata toplevel jl_uv datatype \ - simplevector runtime_intrinsics precompile jloptions \ + simplevector runtime_intrinsics precompile jloptions mtarraylist \ threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \ jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 20e283afb2b862..693cb8d0eadf01 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -119,7 +119,7 @@ static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(bufsz); if (pool_sizes[pool_id] == bufsz) { - arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); return; } } @@ -148,7 +148,7 @@ void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); #endif - arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); } } } @@ -163,9 +163,9 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO if (ssize <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(ssize); ssize = pool_sizes[pool_id]; - arraylist_t *pool = &ptls->heap.free_stacks[pool_id]; + small_arraylist_t *pool = &ptls->heap.free_stacks[pool_id]; if (pool->len > 0) { - stk = arraylist_pop(pool); + stk = small_arraylist_pop(pool); } } else { @@ -184,8 +184,8 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO } *bufsz = ssize; if (owner) { - arraylist_t *live_tasks = &ptls->heap.live_tasks; - arraylist_push(live_tasks, owner); + small_arraylist_t *live_tasks = &ptls->heap.live_tasks; + mtarraylist_push(live_tasks, owner); } return stk; } @@ -209,7 +209,7 @@ void sweep_stack_pools(void) // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { - arraylist_t *al = &ptls2->heap.free_stacks[p]; + small_arraylist_t *al = &ptls2->heap.free_stacks[p]; size_t n_to_free; if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { n_to_free = al->len / 2; @@ -220,12 +220,12 @@ void sweep_stack_pools(void) n_to_free = 0; } for (int n = 0; n < n_to_free; n++) { - void *stk = arraylist_pop(al); + void *stk = small_arraylist_pop(al); free_stack(stk, pool_sizes[p]); } } - arraylist_t *live_tasks = &ptls2->heap.live_tasks; + small_arraylist_t *live_tasks = 
&ptls2->heap.live_tasks;
         size_t n = 0;
         size_t ndel = 0;
         size_t l = live_tasks->len;
@@ -268,24 +268,52 @@ void sweep_stack_pools(void)
 
 JL_DLLEXPORT jl_array_t *jl_live_tasks(void)
 {
-    jl_task_t *ct = jl_current_task;
-    jl_ptls_t ptls = ct->ptls;
-    arraylist_t *live_tasks = &ptls->heap.live_tasks;
-    size_t i, j, l;
-    jl_array_t *a;
-    do {
-        l = live_tasks->len;
-        a = jl_alloc_vec_any(l + 1); // may gc, changing the number of tasks
-    } while (l + 1 < live_tasks->len);
-    l = live_tasks->len;
-    void **lst = live_tasks->items;
-    j = 0;
-    ((void**)jl_array_data(a))[j++] = ptls->root_task;
-    for (i = 0; i < l; i++) {
-        if (((jl_task_t*)lst[i])->stkbuf != NULL)
-            ((void**)jl_array_data(a))[j++] = lst[i];
+    size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
+    jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
+    size_t l = 0; // l is not reset on restart, so we keep getting more aggressive at making a big enough list every time it fails
+restart:
+    for (size_t i = 0; i < nthreads; i++) {
+        // skip GC threads since they don't have tasks
+        if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) {
+            continue;
+        }
+        jl_ptls_t ptls2 = allstates[i];
+        if (ptls2 == NULL)
+            continue;
+        small_arraylist_t *live_tasks = &ptls2->heap.live_tasks;
+        size_t n = mtarraylist_length(live_tasks);
+        l += n + (ptls2->root_task->stkbuf != NULL);
+    }
+    l += l / 20; // add 5% for margin of estimation error
+    jl_array_t *a = jl_alloc_vec_any(l); // may gc, changing the number of tasks and forcing us to reload everything
+    nthreads = jl_atomic_load_acquire(&jl_n_threads);
+    allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
+    size_t j = 0;
+    for (size_t i = 0; i < nthreads; i++) {
+        // skip GC threads since they don't have tasks
+        if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) {
+            continue;
+        }
+        jl_ptls_t ptls2 = allstates[i];
+        if (ptls2 == NULL)
+            continue;
+        jl_task_t *t = ptls2->root_task;
+        if (t->stkbuf != NULL) {
+            if (j == l)
+                goto restart;
+            ((void**)jl_array_data(a))[j++] = t;
+        }
+        small_arraylist_t *live_tasks = &ptls2->heap.live_tasks;
+        size_t n = mtarraylist_length(live_tasks);
+        for (size_t i = 0; i < n; i++) {
+            jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i);
+            if (t->stkbuf != NULL) {
+                if (j == l)
+                    goto restart;
+                ((void**)jl_array_data(a))[j++] = t;
+            }
+        }
     }
-    l = jl_array_len(a);
     if (j < l) {
         JL_GC_PUSH1(&a);
         jl_array_del_end(a, l - j);
diff --git a/src/gc.c b/src/gc.c
index d606f922906323..d9e5c7906cce32 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -1084,7 +1084,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls,
     jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*),
                                                   jl_weakref_type);
     wr->value = value;  // NOTE: wb not needed here
-    arraylist_push(&ptls->heap.weak_refs, wr);
+    small_arraylist_push(&ptls->heap.weak_refs, wr);
     return wr;
 }
 
@@ -3679,8 +3679,10 @@ void jl_init_thread_heap(jl_ptls_t ptls)
         p[i].freelist = NULL;
         p[i].newpages = NULL;
     }
-    arraylist_new(&heap->weak_refs, 0);
-    arraylist_new(&heap->live_tasks, 0);
+    small_arraylist_new(&heap->weak_refs, 0);
+    small_arraylist_new(&heap->live_tasks, 0);
+    for (int i = 0; i < JL_N_STACK_POOLS; i++)
+        small_arraylist_new(&heap->free_stacks[i], 0);
     heap->mallocarrays = NULL;
     heap->mafreelist = NULL;
     heap->big_objects = NULL;
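
Note: the jl_live_tasks rewrite above sizes its snapshot with an estimate-allocate-retry scheme. It sums mtarraylist_length over all threads, pads by roughly 5%, allocates (which may GC and change the task set), and restarts with the accumulated, ever-larger l whenever the array still overflows. A minimal sketch of that pattern, where count_items()/get_item() are hypothetical stand-ins for the per-thread task lists and malloc stands in for jl_alloc_vec_any:

    #include <stddef.h>
    #include <stdlib.h>

    extern size_t count_items(void);   // assumed concurrent-safe length query
    extern void *get_item(size_t i);   // assumed concurrent-safe read; NULL past the end

    void **snapshot_items(size_t *outlen)
    {
        size_t cap = 0; // never reset, so each retry requests a larger buffer
    restart:
        cap += count_items();
        cap += cap / 20; // ~5% margin for items added while we allocate
        void **buf = (void**)malloc(cap * sizeof(void*)); // jl_alloc_vec_any in the real code; may GC
        size_t n = count_items(); // re-read: the list may have grown meanwhile
        size_t j = 0;
        for (size_t i = 0; i < n; i++) {
            void *item = get_item(i);
            if (item == NULL)
                continue;
            if (j == cap) { // the estimate was still too small: grow and retry
                free(buf);
                goto restart;
            }
            buf[j++] = item;
        }
        *outlen = j; // may be below cap; the real code shrinks via jl_array_del_end
        return buf;
    }
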
diff --git a/src/interpreter.c b/src/interpreter.c
index 1f9c416d99b1b1..6f546db9bbbb10 100644
--- a/src/interpreter.c
+++ b/src/interpreter.c
@@ -65,7 +65,8 @@ extern void JL_GC_ENABLEFRAME(interpreter_state*) JL_NOTSAFEPOINT;
 // we define this separately so that we can populate the frame before we add it to the backtrace
 // it's recommended to mark the containing function with NOINLINE, though not essential
 #define JL_GC_ENABLEFRAME(frame) \
-    ((void**)&frame[1])[0] = __builtin_frame_address(0);
+    jl_signal_fence(); \
+    ((void**)&frame[1])[0] = __builtin_frame_address(0);
 
 #endif
diff --git a/src/julia.h b/src/julia.h
index caebdf450ed758..bb747a77a518d7 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -968,6 +968,11 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
                                          int isaligned, jl_value_t *owner);
 JL_DLLEXPORT void jl_gc_safepoint(void);
 
+void *mtarraylist_get(small_arraylist_t *_a, size_t idx) JL_NOTSAFEPOINT;
+size_t mtarraylist_length(small_arraylist_t *_a) JL_NOTSAFEPOINT;
+void mtarraylist_add(small_arraylist_t *_a, void *elt, size_t idx) JL_NOTSAFEPOINT;
+void mtarraylist_push(small_arraylist_t *_a, void *elt) JL_NOTSAFEPOINT;
+
 // object accessors -----------------------------------------------------------
 
 #define jl_svec_len(t) (((jl_svec_t*)(t))->length)
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 9ecc97cc646833..149eebf21ada0f 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -195,10 +195,12 @@ JL_DLLEXPORT void jl_set_profile_peek_duration(double);
 JL_DLLEXPORT void jl_init_profile_lock(void);
 JL_DLLEXPORT uintptr_t jl_lock_profile_rd_held(void) JL_NOTSAFEPOINT;
-JL_DLLEXPORT void jl_lock_profile(void) JL_NOTSAFEPOINT;
-JL_DLLEXPORT void jl_unlock_profile(void) JL_NOTSAFEPOINT;
-JL_DLLEXPORT void jl_lock_profile_wr(void) JL_NOTSAFEPOINT;
-JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT;
+JL_DLLEXPORT void jl_lock_profile(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
+JL_DLLEXPORT void jl_unlock_profile(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
+JL_DLLEXPORT void jl_lock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
+JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
+int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
+void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
 
 // number of cycles since power-on
 static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
@@ -1181,6 +1183,9 @@ void jl_print_bt_entry_codeloc(int sig, jl_bt_element_t *bt_data) JL_NOTSAFEPOIN
 #ifdef _OS_WINDOWS_
 JL_DLLEXPORT void jl_refresh_dbg_module_list(void);
 #endif
+int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) JL_NOTSAFEPOINT;
+void jl_thread_resume(int tid) JL_NOTSAFEPOINT;
+
 // *to is NULL or malloc'd pointer, from is allowed to be NULL
 STATIC_INLINE char *jl_copy_str(char **to, const char *from) JL_NOTSAFEPOINT
 {
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 97a45c356656bf..8570e069383d5e 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -107,7 +107,7 @@ typedef struct {
 
 // handle to reference an OS thread
 #ifdef _OS_WINDOWS_
-typedef DWORD jl_thread_t;
+typedef HANDLE jl_thread_t;
 #else
 typedef pthread_t jl_thread_t;
 #endif
@@ -138,10 +138,10 @@ typedef struct {
 
 typedef struct {
     // variable for tracking weak references
-    arraylist_t weak_refs;
+    small_arraylist_t weak_refs;
     // live tasks started on this thread
     // that are holding onto a stack from the pool
-    arraylist_t live_tasks;
+    small_arraylist_t live_tasks;
 
     // variables for tracking malloc'd arrays
     struct _mallocarray_t *mallocarrays;
@@ -169,7 +169,7 @@ typedef struct {
     jl_gc_pool_t norm_pools[JL_GC_N_POOLS];
 
 #define JL_N_STACK_POOLS 16
-    arraylist_t free_stacks[JL_N_STACK_POOLS];
+    small_arraylist_t free_stacks[JL_N_STACK_POOLS];
 } jl_thread_heap_t;
 
 typedef struct {
diff --git a/src/mtarraylist.c b/src/mtarraylist.c
new file mode 100644
index 00000000000000..8bad44797dab43
--- /dev/null
+++ b/src/mtarraylist.c
@@ -0,0 +1,81 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#include "julia.h"
+#include "julia_internal.h"
+#include "julia_assert.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// this file provides some alternate API functions for small_arraylist (push and add)
+// which can be safely observed from other threads concurrently
+// only a single writer thread is permitted (otherwise writers must hold a mutex),
+// but there can be any number of concurrent observers
+
+typedef struct {
+    _Atomic(uint32_t) len;
+    uint32_t max;
+    _Atomic(_Atomic(void*)*) items;
+    _Atomic(void*) _space[SMALL_AL_N_INLINE];
+} small_mtarraylist_t;
+
+// change capacity to at least newlen
+static void mtarraylist_resizeto(small_mtarraylist_t *a, size_t len, size_t newlen) JL_NOTSAFEPOINT
+{
+    size_t max = a->max;
+    if (newlen > max) {
+        size_t nm = max * 2;
+        if (nm == 0)
+            nm = 1;
+        while (newlen > nm)
+            nm *= 2;
+        void *olditems = (void*)jl_atomic_load_relaxed(&a->items);
+        void *p = calloc_s(nm * sizeof(void*));
+        memcpy(p, olditems, len * sizeof(void*));
+        jl_atomic_store_release(&a->items, (_Atomic(void*)*)p);
+        a->max = nm;
+        if (olditems != (void*)&a->_space[0]) {
+            jl_task_t *ct = jl_current_task;
+            jl_gc_add_quiescent(ct->ptls, (void**)olditems, free);
+        }
+    }
+}
+
+// single-threaded
+void mtarraylist_push(small_arraylist_t *_a, void *elt)
+{
+    small_mtarraylist_t *a = (small_mtarraylist_t*)_a;
+    size_t len = jl_atomic_load_relaxed(&a->len);
+    mtarraylist_resizeto(a, len, len + 1);
+    jl_atomic_store_release(&jl_atomic_load_relaxed(&a->items)[len], elt);
+    jl_atomic_store_release(&a->len, len + 1);
+}
+
+// single-threaded
+void mtarraylist_add(small_arraylist_t *_a, void *elt, size_t idx)
+{
+    small_mtarraylist_t *a = (small_mtarraylist_t*)_a;
+    size_t len = jl_atomic_load_relaxed(&a->len);
+    mtarraylist_resizeto(a, len, idx + 1);
+    jl_atomic_store_release(&jl_atomic_load_relaxed(&a->items)[idx], elt);
+    if (jl_atomic_load_relaxed(&a->len) < idx + 1)
+        jl_atomic_store_release(&a->len, idx + 1);
+}
+
+// concurrent-safe
+size_t mtarraylist_length(small_arraylist_t *_a)
+{
+    small_mtarraylist_t *a = (small_mtarraylist_t*)_a;
+    return jl_atomic_load_relaxed(&a->len);
+}
+
+// concurrent-safe
+void *mtarraylist_get(small_arraylist_t *_a, size_t idx)
+{
+    small_mtarraylist_t *a = (small_mtarraylist_t*)_a;
+    size_t len = jl_atomic_load_acquire(&a->len);
+    if (idx >= len)
+        return NULL;
+    return jl_atomic_load_relaxed(&jl_atomic_load_relaxed(&a->items)[idx]);
+}
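
Note: mtarraylist.c above layers a single-writer/many-reader protocol on small_arraylist_t. Resizes publish the new items buffer with a release store, push release-stores the element before the new length, and mtarraylist_get acquire-loads the length before indexing, so a reader can never observe a slot that was not fully written. A minimal usage sketch under those rules, using only the mtarraylist_* functions declared in julia.h by this patch; the scaffolding around them is illustrative, not part of the patch:

    #include "julia.h"

    static small_arraylist_t list; // initialized once with small_arraylist_new(&list, 0)

    // exactly one writer thread (or serialize writers with a mutex)
    void writer_append(void *elt)
    {
        mtarraylist_push(&list, elt); // release-publishes the slot, then the length
    }

    // safe on any thread, concurrently with the writer
    void reader_scan(void (*visit)(void *))
    {
        size_t n = mtarraylist_length(&list); // snapshot of the current length
        for (size_t i = 0; i < n; i++) {
            void *elt = mtarraylist_get(&list, i); // NULL if i is past the current end
            if (elt != NULL)
                visit(elt);
        }
    }
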
diff --git a/src/signals-mach.c b/src/signals-mach.c
index 8ac7e5301d7ad5..32f0354cf17151 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -381,12 +381,12 @@ static void attach_exception_port(thread_port_t thread, int segv_only)
     HANDLE_MACH_ERROR("thread_set_exception_ports", ret);
 }
 
-static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx)
+static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) JL_NOTSAFEPOINT
 {
     jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
     if (ptls2 == NULL) // this thread is not alive
         return 0;
-    jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
+    jl_task_t *ct2 = jl_atomic_load_relaxed(&ptls2->current_task);
     if (ct2 == NULL) // this thread is already dead
         return 0;
 
@@ -404,18 +404,18 @@ static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx)
     return 1;
 }
 
-static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t **ctx)
+int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
 {
     (void)timeout;
-    static host_thread_state_t state;
+    host_thread_state_t state;
     if (!jl_thread_suspend_and_get_state2(tid, &state)) {
-        *ctx = NULL;
-        return;
+        return 0;
     }
-    *ctx = (unw_context_t*)&state;
+    *ctx = *(unw_context_t*)&state;
+    return 1;
 }
 
-static void jl_thread_resume(int tid, int sig)
+void jl_thread_resume(int tid)
 {
     jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
     mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);
@@ -583,8 +583,15 @@ static void jl_unlock_profile_mach(int dlsymlock, int keymgr_locked)
         jl_unlock_profile();
 }
 
-#define jl_lock_profile() int keymgr_locked = jl_lock_profile_mach(1)
-#define jl_unlock_profile() jl_unlock_profile_mach(1, keymgr_locked)
+int jl_lock_stackwalk(void)
+{
+    return jl_lock_profile_mach(1);
+}
+
+void jl_unlock_stackwalk(int lockret)
+{
+    jl_unlock_profile_mach(1, lockret);
+}
 
 void *mach_profile_listener(void *arg)
 {
@@ -677,7 +684,7 @@ void *mach_profile_listener(void *arg)
                 bt_data_prof[bt_size_cur++].uintptr = 0;
             }
             // We're done! Resume the thread.
-            jl_thread_resume(i, 0);
+            jl_thread_resume(i);
         }
         jl_unlock_profile_mach(0, keymgr_locked);
         if (running) {
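
Note: the Mach implementation above and the POSIX one below expose the same four-call discipline that every consumer in this patch follows: take the stackwalk lock, suspend a thread and copy out its context, sample it, resume it, and unlock. A minimal sketch of that calling convention, where sample_thread() is a hypothetical stand-in for rec_backtrace_ctx:

    #include "julia.h"
    #include "julia_internal.h"

    extern void sample_thread(int tid, bt_context_t *ctx); // hypothetical callback

    void sample_all_threads(void)
    {
        int nthreads = jl_atomic_load_acquire(&jl_n_threads);
        int lockret = jl_lock_stackwalk(); // also takes jl_in_stackwalk on Windows,
                                           // and the keymgr lock on macOS
        for (int tid = nthreads - 1; tid >= 0; tid--) {
            bt_context_t ctx;
            // returns 0 if the thread is dead or did not stop in time
            if (!jl_thread_suspend_and_get_state(tid, /* timeout */ 1, &ctx))
                continue;
            sample_thread(tid, &ctx); // the context is a copy; the thread stays stopped
            jl_thread_resume(tid);    // exactly once per successful suspend
        }
        jl_unlock_stackwalk(lockret); // pass back the token from jl_lock_stackwalk
    }
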
diff --git a/src/signals-unix.c b/src/signals-unix.c
index 165ef07abb3fe2..f38389913aa596 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -293,6 +293,18 @@ int exc_reg_is_write_fault(uintptr_t esr) {
 #include "signals-mach.c"
 #else
 
+int jl_lock_stackwalk(void)
+{
+    jl_lock_profile();
+    return 0;
+}
+
+void jl_unlock_stackwalk(int lockret)
+{
+    (void)lockret;
+    jl_unlock_profile();
+}
+
 #if defined(_OS_LINUX_) && (defined(_CPU_X86_64_) || defined(_CPU_X86_))
 int is_write_fault(void *context) {
@@ -386,12 +398,12 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context)
 }
 
 #if !defined(JL_DISABLE_LIBUNWIND)
-static unw_context_t *signal_context;
+static bt_context_t *signal_context;
 pthread_mutex_t in_signal_lock;
 static pthread_cond_t exit_signal_cond;
 static pthread_cond_t signal_caught_cond;
 
-static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t **ctx)
+int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
 {
     struct timespec ts;
     clock_gettime(CLOCK_REALTIME, &ts);
@@ -401,9 +413,8 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t
     jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
     if (ct2 == NULL) {
         // this thread is not alive or already dead
-        *ctx = NULL;
         pthread_mutex_unlock(&in_signal_lock);
-        return;
+        return 0;
     }
     jl_atomic_store_release(&ptls2->signal_request, 1);
     pthread_kill(ptls2->system_id, SIGUSR2);
@@ -412,9 +423,8 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t
     if (err == ETIMEDOUT) {
         sig_atomic_t request = 1;
         if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
-            *ctx = NULL;
             pthread_mutex_unlock(&in_signal_lock);
-            return;
+            return 0;
         }
         // Request is either now 0 (meaning the other thread is waiting for
         // exit_signal_cond already),
@@ -431,15 +441,16 @@
     // checking it is 0, and add an acquire barrier for good measure)
     int request = jl_atomic_load_acquire(&ptls2->signal_request);
     assert(request == 0); (void) request;
-    *ctx = signal_context;
+    jl_atomic_store_release(&ptls2->signal_request, 1); // prepare to resume normally
+    *ctx = *signal_context;
+    return 1;
 }
 
-static void jl_thread_resume(int tid, int sig)
+void jl_thread_resume(int tid)
 {
     jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
-    jl_atomic_store_release(&ptls2->signal_request, sig == -1 ? 3 : 1);
     pthread_cond_broadcast(&exit_signal_cond);
-    pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge
+    pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge (so that signal_request doesn't get mixed up)
     // The other thread is waiting to leave exit_signal_cond (verify that here by
     // checking it is 0, and add an acquire barrier for good measure)
     int request = jl_atomic_load_acquire(&ptls2->signal_request);
@@ -474,14 +485,14 @@ CFI_NORETURN
 static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
 {
     jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0];
-    unw_context_t *signal_context;
+    bt_context_t signal_context;
     // This also makes sure `sleep` is aborted.
-    jl_thread_suspend_and_get_state(0, 30, &signal_context);
-    if (signal_context != NULL) {
+    if (jl_thread_suspend_and_get_state(0, 30, &signal_context)) {
         thread0_exit_signo = signo;
         ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE
         memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0]));
-        jl_thread_resume(0, -1); // resume with message 3 (call jl_exit_thread0_cb)
+        jl_atomic_store_release(&ptls2->signal_request, 3);
+        jl_thread_resume(0); // resume with message 3 (call jl_exit_thread0_cb)
     }
     else {
         // thread 0 is gone? just do the exit ourself
@@ -877,11 +888,11 @@ static void *signal_listener(void *arg)
         int nthreads = jl_atomic_load_acquire(&jl_n_threads);
         bt_size = 0;
 #if !defined(JL_DISABLE_LIBUNWIND)
-        unw_context_t *signal_context;
+        bt_context_t signal_context;
         // sample each thread, round-robin style in reverse order
         // (so that thread zero gets notified last)
         if (critical || profile) {
-            jl_lock_profile();
+            int lockret = jl_lock_stackwalk();
             int *randperm;
             if (profile)
                 randperm = profile_get_randperm(nthreads);
             for (int idx = nthreads; idx-- > 0; ) {
                 // Stop the threads in the random or reverse round-robin order.
                 int i = profile ?
randperm[idx] : idx; // notify thread to stop - jl_thread_suspend_and_get_state(i, 1, &signal_context); - if (signal_context == NULL) + if (!jl_thread_suspend_and_get_state(i, 1, &signal_context)) continue; // do backtrace on thread contexts for critical signals @@ -898,7 +908,7 @@ static void *signal_listener(void *arg) if (critical) { bt_size += rec_backtrace_ctx(bt_data + bt_size, JL_MAX_BT_SIZE / nthreads - 1, - signal_context, NULL); + &signal_context, NULL); bt_data[bt_size++].uintptr = 0; } @@ -920,7 +930,7 @@ static void *signal_listener(void *arg) } else { // Get backtrace data bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, signal_context, NULL); + bt_size_max - bt_size_cur - 1, &signal_context, NULL); } jl_set_safe_restore(old_buf); @@ -945,9 +955,9 @@ static void *signal_listener(void *arg) } // notify thread to resume - jl_thread_resume(i, sig); + jl_thread_resume(i); } - jl_unlock_profile(); + jl_unlock_stackwalk(lockret); } #ifndef HAVE_MACH if (profile && running) { diff --git a/src/signals-win.c b/src/signals-win.c index cca0af52ace53c..d1f83d6bfdcc46 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -344,6 +344,54 @@ JL_DLLEXPORT void jl_install_sigint_handler(void) static volatile HANDLE hBtThread = 0; +int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) +{ + (void)timeout; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + if (ptls2 == NULL) // this thread is not alive + return 0; + jl_task_t *ct2 = jl_atomic_load_relaxed(&ptls2->current_task); + if (ct2 == NULL) // this thread is already dead + return 0; + HANDLE hThread = ptls2->system_id; + if ((DWORD)-1 == SuspendThread(hThread)) + return 0; + assert(sizeof(*ctx) == sizeof(CONTEXT)); + memset(ctx, 0, sizeof(CONTEXT)); + ctx->ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + if (!GetThreadContext(hThread, ctx)) { + if ((DWORD)-1 == ResumeThread(hThread)) + abort(); + return 0; + } + return 1; +} + +void jl_thread_resume(int tid) +{ + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + HANDLE hThread = ptls2->system_id; + if ((DWORD)-1 == ResumeThread(hThread)) { + fputs("failed to resume main thread! aborting.", stderr); + abort(); + } +} + +int jl_lock_stackwalk(void) +{ + uv_mutex_lock(&jl_in_stackwalk); + jl_lock_profile(); + return 0; +} + +void jl_unlock_stackwalk(int lockret) +{ + (void)lockret; + jl_unlock_profile(); + uv_mutex_unlock(&jl_in_stackwalk); +} + + static DWORD WINAPI profile_bt( LPVOID lparam ) { // Note: illegal to use jl_* functions from this thread except for profiling-specific functions @@ -357,58 +405,45 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) continue; } else { - uv_mutex_lock(&jl_in_stackwalk); - jl_lock_profile(); - if ((DWORD)-1 == SuspendThread(hMainThread)) { - fputs("failed to suspend main thread. aborting profiling.", stderr); - break; - } + // TODO: bring this up to parity with other OS by adding loop over tid here + int lockret = jl_lock_stackwalk(); CONTEXT ctxThread; - memset(&ctxThread, 0, sizeof(CONTEXT)); - ctxThread.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; - if (!GetThreadContext(hMainThread, &ctxThread)) { - fputs("failed to get context from main thread. aborting profiling.", stderr); + if (!jl_thread_suspend_and_get_state(0, 0, &ctxThread)) { + jl_unlock_stackwalk(lockret); + fputs("failed to suspend main thread. 
aborting profiling.", stderr); jl_profile_stop_timer(); + break; } - else { - // Get backtrace data - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, &ctxThread, NULL); + // Get backtrace data + bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, + bt_size_max - bt_size_cur - 1, &ctxThread, NULL); - jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; + // store threadid but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); + // store task id (never null) + bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + // store cpu cycle clock + bt_data_prof[bt_size_cur++].uintptr = cycleclock(); - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; - // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; - } - jl_unlock_profile(); - uv_mutex_unlock(&jl_in_stackwalk); - if ((DWORD)-1 == ResumeThread(hMainThread)) { - jl_profile_stop_timer(); - fputs("failed to resume main thread! 
aborting.", stderr); - jl_gc_debug_critical_error(); - abort(); - } + // Mark the end of this block with two 0's + bt_data_prof[bt_size_cur++].uintptr = 0; + bt_data_prof[bt_size_cur++].uintptr = 0; + jl_unlock_stackwalk(lockret); + jl_thread_resume(0); jl_check_profile_autostop(); } } } - jl_unlock_profile(); uv_mutex_unlock(&jl_in_stackwalk); jl_profile_stop_timer(); - hBtThread = 0; + hBtThread = NULL; return 0; } diff --git a/src/stackwalk.c b/src/stackwalk.c index bd48b27b0cacd0..21a858e55b6944 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -865,7 +865,7 @@ _os_ptr_munge(uintptr_t ptr) extern bt_context_t *jl_to_bt_context(void *sigctx); -void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT +static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; @@ -874,222 +874,242 @@ void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0); return; } - if (t->copy_stack || !t->started || t->stkbuf == NULL) - return; - int16_t old = -1; - if (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) - return; bt_context_t *context = NULL; -#if defined(_OS_WINDOWS_) bt_context_t c; - memset(&c, 0, sizeof(c)); - _JUMP_BUFFER *mctx = (_JUMP_BUFFER*)&t->ctx.ctx.uc_mcontext; + int16_t old = -1; + while (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) { + int lockret = jl_lock_stackwalk(); + // if this task is already running somewhere, we need to stop the thread it is running on and query its state + if (!jl_thread_suspend_and_get_state(old, 0, &c)) { + jl_unlock_stackwalk(lockret); + return; + } + jl_unlock_stackwalk(lockret); + if (jl_atomic_load_relaxed(&t->tid) == old) { + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[old]; + if (ptls2->previous_task == t || // we might print the wrong stack here, since we can't know whether we executed the swapcontext yet or not, but it at least avoids trying to access the state inside uc_mcontext which might not be set yet + (ptls2->previous_task == NULL && jl_atomic_load_relaxed(&ptls2->current_task) == t)) { // this case should be always accurate + // use the thread context for the unwind state + context = &c; + } + break; + } + // got the wrong thread stopped, try again + jl_thread_resume(old); + } + if (context == NULL && (!t->copy_stack && t->started && t->stkbuf != NULL)) { + // need to read the context from the task stored state +#if defined(_OS_WINDOWS_) + memset(&c, 0, sizeof(c)); + _JUMP_BUFFER *mctx = (_JUMP_BUFFER*)&t->ctx.ctx.uc_mcontext; #if defined(_CPU_X86_64_) - c.Rbx = mctx->Rbx; - c.Rsp = mctx->Rsp; - c.Rbp = mctx->Rbp; - c.Rsi = mctx->Rsi; - c.Rdi = mctx->Rdi; - c.R12 = mctx->R12; - c.R13 = mctx->R13; - c.R14 = mctx->R14; - c.R15 = mctx->R15; - c.Rip = mctx->Rip; - memcpy(&c.Xmm6, &mctx->Xmm6, 10 * sizeof(mctx->Xmm6)); // Xmm6-Xmm15 + c.Rbx = mctx->Rbx; + c.Rsp = mctx->Rsp; + c.Rbp = mctx->Rbp; + c.Rsi = mctx->Rsi; + c.Rdi = mctx->Rdi; + c.R12 = mctx->R12; + c.R13 = mctx->R13; + c.R14 = mctx->R14; + c.R15 = mctx->R15; + c.Rip = mctx->Rip; + memcpy(&c.Xmm6, &mctx->Xmm6, 10 * sizeof(mctx->Xmm6)); // Xmm6-Xmm15 #else - c.Eip = mctx->Eip; - c.Esp = mctx->Esp; - c.Ebp = mctx->Ebp; + c.Eip = mctx->Eip; + c.Esp = mctx->Esp; + c.Ebp = mctx->Ebp; #endif - context = &c; + context = &c; #elif defined(JL_HAVE_UNW_CONTEXT) - context = &t->ctx.ctx; + context = &t->ctx.ctx; #elif defined(JL_HAVE_UCONTEXT) - context = jl_to_bt_context(&t->ctx.ctx); + context = jl_to_bt_context(&t->ctx.ctx); 
#elif defined(JL_HAVE_ASM) - bt_context_t c; - memset(&c, 0, sizeof(c)); - #if defined(_OS_LINUX_) && defined(__GLIBC__) - __jmp_buf *mctx = &t->ctx.ctx.uc_mcontext->__jmpbuf; - mcontext_t *mc = &c.uc_mcontext; - #if defined(_CPU_X86_) - // https://github.com/bminor/glibc/blame/master/sysdeps/i386/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/i386/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/i386/longjmp.s - mc->gregs[REG_EBX] = (*mctx)[0]; - mc->gregs[REG_ESI] = (*mctx)[1]; - mc->gregs[REG_EDI] = (*mctx)[2]; - mc->gregs[REG_EBP] = (*mctx)[3]; - mc->gregs[REG_ESP] = (*mctx)[4]; - mc->gregs[REG_EIP] = (*mctx)[5]; - // ifdef PTR_DEMANGLE ? - mc->gregs[REG_ESP] = ptr_demangle(mc->gregs[REG_ESP]); - mc->gregs[REG_EIP] = ptr_demangle(mc->gregs[REG_EIP]); - context = &c; - #elif defined(_CPU_X86_64_) - // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/x86_64/setjmp.s - mc->gregs[REG_RBX] = (*mctx)[0]; - mc->gregs[REG_RBP] = (*mctx)[1]; - mc->gregs[REG_R12] = (*mctx)[2]; - mc->gregs[REG_R13] = (*mctx)[3]; - mc->gregs[REG_R14] = (*mctx)[4]; - mc->gregs[REG_R15] = (*mctx)[5]; - mc->gregs[REG_RSP] = (*mctx)[6]; - mc->gregs[REG_RIP] = (*mctx)[7]; - // ifdef PTR_DEMANGLE ? - mc->gregs[REG_RBP] = ptr_demangle(mc->gregs[REG_RBP]); - mc->gregs[REG_RSP] = ptr_demangle(mc->gregs[REG_RSP]); - mc->gregs[REG_RIP] = ptr_demangle(mc->gregs[REG_RIP]); - context = &c; - #elif defined(_CPU_ARM_) - // https://github.com/bminor/glibc/blame/master/sysdeps/arm/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/arm/include/bits/setjmp.h - // https://github.com/bminor/musl/blame/master/src/setjmp/arm/longjmp.S - mc->arm_sp = (*mctx)[0]; - mc->arm_lr = (*mctx)[1]; - mc->arm_r4 = (*mctx)[2]; // aka v1 - mc->arm_r5 = (*mctx)[3]; // aka v2 - mc->arm_r6 = (*mctx)[4]; // aka v3 - mc->arm_r7 = (*mctx)[5]; // aka v4 - mc->arm_r8 = (*mctx)[6]; // aka v5 - mc->arm_r9 = (*mctx)[7]; // aka v6 aka sb - mc->arm_r10 = (*mctx)[8]; // aka v7 aka sl - mc->arm_fp = (*mctx)[10]; // aka v8 aka r11 - // ifdef PTR_DEMANGLE ? - mc->arm_sp = ptr_demangle(mc->arm_sp); - mc->arm_lr = ptr_demangle(mc->arm_lr); - mc->arm_pc = mc->arm_lr; - context = &c; - #elif defined(_CPU_AARCH64_) - // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/__longjmp.S - // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/jmpbuf-offsets.h - // https://github.com/bminor/musl/blame/master/src/setjmp/aarch64/longjmp.s - // https://github.com/libunwind/libunwind/blob/ec171c9ba7ea3abb2a1383cee2988a7abd483a1f/src/aarch64/unwind_i.h#L62 - unw_fpsimd_context_t *mcfp = (unw_fpsimd_context_t*)&mc->__reserved; - mc->regs[19] = (*mctx)[0]; - mc->regs[20] = (*mctx)[1]; - mc->regs[21] = (*mctx)[2]; - mc->regs[22] = (*mctx)[3]; - mc->regs[23] = (*mctx)[4]; - mc->regs[24] = (*mctx)[5]; - mc->regs[25] = (*mctx)[6]; - mc->regs[26] = (*mctx)[7]; - mc->regs[27] = (*mctx)[8]; - mc->regs[28] = (*mctx)[9]; - mc->regs[29] = (*mctx)[10]; // aka fp - mc->regs[30] = (*mctx)[11]; // aka lr - // Yes, they did skip 12 why writing the code originally; and, no, I do not know why. 
- mc->sp = (*mctx)[13]; - mcfp->vregs[7] = (*mctx)[14]; // aka d8 - mcfp->vregs[8] = (*mctx)[15]; // aka d9 - mcfp->vregs[9] = (*mctx)[16]; // aka d10 - mcfp->vregs[10] = (*mctx)[17]; // aka d11 - mcfp->vregs[11] = (*mctx)[18]; // aka d12 - mcfp->vregs[12] = (*mctx)[19]; // aka d13 - mcfp->vregs[13] = (*mctx)[20]; // aka d14 - mcfp->vregs[14] = (*mctx)[21]; // aka d15 - // ifdef PTR_DEMANGLE ? - mc->sp = ptr_demangle(mc->sp); - mc->regs[30] = ptr_demangle(mc->regs[30]); - mc->pc = mc->regs[30]; - context = &c; - #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown linux") - (void)mc; - (void)c; - (void)mctx; - #endif - #elif defined(_OS_DARWIN_) - sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext; - #if defined(_CPU_X86_64_) - // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/x86_64/_setjmp.s - x86_thread_state64_t *mc = (x86_thread_state64_t*)&c; - mc->__rbx = ((uint64_t*)mctx)[0]; - mc->__rbp = ((uint64_t*)mctx)[1]; - mc->__rsp = ((uint64_t*)mctx)[2]; - mc->__r12 = ((uint64_t*)mctx)[3]; - mc->__r13 = ((uint64_t*)mctx)[4]; - mc->__r14 = ((uint64_t*)mctx)[5]; - mc->__r15 = ((uint64_t*)mctx)[6]; - mc->__rip = ((uint64_t*)mctx)[7]; - // added in libsystem_plaform 177.200.16 (macOS Mojave 10.14.3) - // prior to that _os_ptr_munge_token was (hopefully) typically 0, - // so x ^ 0 == x and this is a no-op - mc->__rbp = _OS_PTR_UNMUNGE(mc->__rbp); - mc->__rsp = _OS_PTR_UNMUNGE(mc->__rsp); - mc->__rip = _OS_PTR_UNMUNGE(mc->__rip); - context = &c; - #elif defined(_CPU_AARCH64_) - // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/arm64/setjmp.s - // https://github.com/apple/darwin-xnu/blob/main/osfmk/mach/arm/_structs.h - // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_arm64 - arm_thread_state64_t *mc = (arm_thread_state64_t*)&c; - mc->__x[19] = ((uint64_t*)mctx)[0]; - mc->__x[20] = ((uint64_t*)mctx)[1]; - mc->__x[21] = ((uint64_t*)mctx)[2]; - mc->__x[22] = ((uint64_t*)mctx)[3]; - mc->__x[23] = ((uint64_t*)mctx)[4]; - mc->__x[24] = ((uint64_t*)mctx)[5]; - mc->__x[25] = ((uint64_t*)mctx)[6]; - mc->__x[26] = ((uint64_t*)mctx)[7]; - mc->__x[27] = ((uint64_t*)mctx)[8]; - mc->__x[28] = ((uint64_t*)mctx)[9]; - mc->__x[10] = ((uint64_t*)mctx)[10]; - mc->__x[11] = ((uint64_t*)mctx)[11]; - mc->__x[12] = ((uint64_t*)mctx)[12]; - // 13 is reserved/unused - double *mcfp = (double*)&mc[1]; - mcfp[7] = ((uint64_t*)mctx)[14]; // aka d8 - mcfp[8] = ((uint64_t*)mctx)[15]; // aka d9 - mcfp[9] = ((uint64_t*)mctx)[16]; // aka d10 - mcfp[10] = ((uint64_t*)mctx)[17]; // aka d11 - mcfp[11] = ((uint64_t*)mctx)[18]; // aka d12 - mcfp[12] = ((uint64_t*)mctx)[19]; // aka d13 - mcfp[13] = ((uint64_t*)mctx)[20]; // aka d14 - mcfp[14] = ((uint64_t*)mctx)[21]; // aka d15 - mc->__fp = _OS_PTR_UNMUNGE(mc->__x[10]); - mc->__lr = _OS_PTR_UNMUNGE(mc->__x[11]); - mc->__x[12] = _OS_PTR_UNMUNGE(mc->__x[12]); - mc->__sp = mc->__x[12]; - // libunwind is broken for signed-pointers, but perhaps best not to leave the signed pointer lying around either - mc->__pc = ptrauth_strip(mc->__lr, 0); - mc->__pad = 0; // aka __ra_sign_state = not signed - context = &c; - #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown darwin") - (void)mctx; - (void)c; - #endif - #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_) - sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext; - mcontext_t *mc = &c.uc_mcontext; - // 
https://github.com/freebsd/freebsd-src/blob/releng/13.1/lib/libc/amd64/gen/_setjmp.S - mc->mc_rip = ((long*)mctx)[0]; - mc->mc_rbx = ((long*)mctx)[1]; - mc->mc_rsp = ((long*)mctx)[2]; - mc->mc_rbp = ((long*)mctx)[3]; - mc->mc_r12 = ((long*)mctx)[4]; - mc->mc_r13 = ((long*)mctx)[5]; - mc->mc_r14 = ((long*)mctx)[6]; - mc->mc_r15 = ((long*)mctx)[7]; - context = &c; - #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown system") - (void)c; - #endif + memset(&c, 0, sizeof(c)); + #if defined(_OS_LINUX_) && defined(__GLIBC__) + __jmp_buf *mctx = &t->ctx.ctx.uc_mcontext->__jmpbuf; + mcontext_t *mc = &c.uc_mcontext; + #if defined(_CPU_X86_) + // https://github.com/bminor/glibc/blame/master/sysdeps/i386/__longjmp.S + // https://github.com/bminor/glibc/blame/master/sysdeps/i386/jmpbuf-offsets.h + // https://github.com/bminor/musl/blame/master/src/setjmp/i386/longjmp.s + mc->gregs[REG_EBX] = (*mctx)[0]; + mc->gregs[REG_ESI] = (*mctx)[1]; + mc->gregs[REG_EDI] = (*mctx)[2]; + mc->gregs[REG_EBP] = (*mctx)[3]; + mc->gregs[REG_ESP] = (*mctx)[4]; + mc->gregs[REG_EIP] = (*mctx)[5]; + // ifdef PTR_DEMANGLE ? + mc->gregs[REG_ESP] = ptr_demangle(mc->gregs[REG_ESP]); + mc->gregs[REG_EIP] = ptr_demangle(mc->gregs[REG_EIP]); + context = &c; + #elif defined(_CPU_X86_64_) + // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/__longjmp.S + // https://github.com/bminor/glibc/blame/master/sysdeps/x86_64/jmpbuf-offsets.h + // https://github.com/bminor/musl/blame/master/src/setjmp/x86_64/setjmp.s + mc->gregs[REG_RBX] = (*mctx)[0]; + mc->gregs[REG_RBP] = (*mctx)[1]; + mc->gregs[REG_R12] = (*mctx)[2]; + mc->gregs[REG_R13] = (*mctx)[3]; + mc->gregs[REG_R14] = (*mctx)[4]; + mc->gregs[REG_R15] = (*mctx)[5]; + mc->gregs[REG_RSP] = (*mctx)[6]; + mc->gregs[REG_RIP] = (*mctx)[7]; + // ifdef PTR_DEMANGLE ? + mc->gregs[REG_RBP] = ptr_demangle(mc->gregs[REG_RBP]); + mc->gregs[REG_RSP] = ptr_demangle(mc->gregs[REG_RSP]); + mc->gregs[REG_RIP] = ptr_demangle(mc->gregs[REG_RIP]); + context = &c; + #elif defined(_CPU_ARM_) + // https://github.com/bminor/glibc/blame/master/sysdeps/arm/__longjmp.S + // https://github.com/bminor/glibc/blame/master/sysdeps/arm/include/bits/setjmp.h + // https://github.com/bminor/musl/blame/master/src/setjmp/arm/longjmp.S + mc->arm_sp = (*mctx)[0]; + mc->arm_lr = (*mctx)[1]; + mc->arm_r4 = (*mctx)[2]; // aka v1 + mc->arm_r5 = (*mctx)[3]; // aka v2 + mc->arm_r6 = (*mctx)[4]; // aka v3 + mc->arm_r7 = (*mctx)[5]; // aka v4 + mc->arm_r8 = (*mctx)[6]; // aka v5 + mc->arm_r9 = (*mctx)[7]; // aka v6 aka sb + mc->arm_r10 = (*mctx)[8]; // aka v7 aka sl + mc->arm_fp = (*mctx)[10]; // aka v8 aka r11 + // ifdef PTR_DEMANGLE ? 
+        mc->arm_sp = ptr_demangle(mc->arm_sp);
+        mc->arm_lr = ptr_demangle(mc->arm_lr);
+        mc->arm_pc = mc->arm_lr;
+        context = &c;
+    #elif defined(_CPU_AARCH64_)
+        // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/__longjmp.S
+        // https://github.com/bminor/glibc/blame/master/sysdeps/aarch64/jmpbuf-offsets.h
+        // https://github.com/bminor/musl/blame/master/src/setjmp/aarch64/longjmp.s
+        // https://github.com/libunwind/libunwind/blob/ec171c9ba7ea3abb2a1383cee2988a7abd483a1f/src/aarch64/unwind_i.h#L62
+        unw_fpsimd_context_t *mcfp = (unw_fpsimd_context_t*)&mc->__reserved;
+        mc->regs[19] = (*mctx)[0];
+        mc->regs[20] = (*mctx)[1];
+        mc->regs[21] = (*mctx)[2];
+        mc->regs[22] = (*mctx)[3];
+        mc->regs[23] = (*mctx)[4];
+        mc->regs[24] = (*mctx)[5];
+        mc->regs[25] = (*mctx)[6];
+        mc->regs[26] = (*mctx)[7];
+        mc->regs[27] = (*mctx)[8];
+        mc->regs[28] = (*mctx)[9];
+        mc->regs[29] = (*mctx)[10]; // aka fp
+        mc->regs[30] = (*mctx)[11]; // aka lr
+        // Yes, they did skip 12 when writing the code originally; and, no, I do not know why.
+        mc->sp = (*mctx)[13];
+        mcfp->vregs[7] = (*mctx)[14]; // aka d8
+        mcfp->vregs[8] = (*mctx)[15]; // aka d9
+        mcfp->vregs[9] = (*mctx)[16]; // aka d10
+        mcfp->vregs[10] = (*mctx)[17]; // aka d11
+        mcfp->vregs[11] = (*mctx)[18]; // aka d12
+        mcfp->vregs[12] = (*mctx)[19]; // aka d13
+        mcfp->vregs[13] = (*mctx)[20]; // aka d14
+        mcfp->vregs[14] = (*mctx)[21]; // aka d15
+        // ifdef PTR_DEMANGLE ?
+        mc->sp = ptr_demangle(mc->sp);
+        mc->regs[30] = ptr_demangle(mc->regs[30]);
+        mc->pc = mc->regs[30];
+        context = &c;
+    #else
+    #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown linux")
+        (void)mc;
+        (void)c;
+        (void)mctx;
+    #endif
+    #elif defined(_OS_DARWIN_)
+        sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext;
+    #if defined(_CPU_X86_64_)
+        // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/x86_64/_setjmp.s
+        x86_thread_state64_t *mc = (x86_thread_state64_t*)&c;
+        mc->__rbx = ((uint64_t*)mctx)[0];
+        mc->__rbp = ((uint64_t*)mctx)[1];
+        mc->__rsp = ((uint64_t*)mctx)[2];
+        mc->__r12 = ((uint64_t*)mctx)[3];
+        mc->__r13 = ((uint64_t*)mctx)[4];
+        mc->__r14 = ((uint64_t*)mctx)[5];
+        mc->__r15 = ((uint64_t*)mctx)[6];
+        mc->__rip = ((uint64_t*)mctx)[7];
+        // added in libsystem_platform 177.200.16 (macOS Mojave 10.14.3)
+        // prior to that _os_ptr_munge_token was (hopefully) typically 0,
+        // so x ^ 0 == x and this is a no-op
+        mc->__rbp = _OS_PTR_UNMUNGE(mc->__rbp);
+        mc->__rsp = _OS_PTR_UNMUNGE(mc->__rsp);
+        mc->__rip = _OS_PTR_UNMUNGE(mc->__rip);
+        context = &c;
+    #elif defined(_CPU_AARCH64_)
+        // from https://github.com/apple/darwin-libplatform/blob/main/src/setjmp/arm64/setjmp.s
+        // https://github.com/apple/darwin-xnu/blob/main/osfmk/mach/arm/_structs.h
+        // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_arm64
+        arm_thread_state64_t *mc = (arm_thread_state64_t*)&c;
+        mc->__x[19] = ((uint64_t*)mctx)[0];
+        mc->__x[20] = ((uint64_t*)mctx)[1];
+        mc->__x[21] = ((uint64_t*)mctx)[2];
+        mc->__x[22] = ((uint64_t*)mctx)[3];
+        mc->__x[23] = ((uint64_t*)mctx)[4];
+        mc->__x[24] = ((uint64_t*)mctx)[5];
+        mc->__x[25] = ((uint64_t*)mctx)[6];
+        mc->__x[26] = ((uint64_t*)mctx)[7];
+        mc->__x[27] = ((uint64_t*)mctx)[8];
+        mc->__x[28] = ((uint64_t*)mctx)[9];
+        mc->__x[10] = ((uint64_t*)mctx)[10];
+        mc->__x[11] = ((uint64_t*)mctx)[11];
+        mc->__x[12] = ((uint64_t*)mctx)[12];
+        // 13 is reserved/unused
+        double *mcfp = (double*)&mc[1];
+        mcfp[7] = ((uint64_t*)mctx)[14]; // aka d8
+        mcfp[8] = ((uint64_t*)mctx)[15]; // aka d9
+        mcfp[9] = ((uint64_t*)mctx)[16]; // aka d10
+        mcfp[10] = ((uint64_t*)mctx)[17]; // aka d11
+        mcfp[11] = ((uint64_t*)mctx)[18]; // aka d12
+        mcfp[12] = ((uint64_t*)mctx)[19]; // aka d13
+        mcfp[13] = ((uint64_t*)mctx)[20]; // aka d14
+        mcfp[14] = ((uint64_t*)mctx)[21]; // aka d15
+        mc->__fp = _OS_PTR_UNMUNGE(mc->__x[10]);
+        mc->__lr = _OS_PTR_UNMUNGE(mc->__x[11]);
+        mc->__x[12] = _OS_PTR_UNMUNGE(mc->__x[12]);
+        mc->__sp = mc->__x[12];
+        // libunwind is broken for signed-pointers, but perhaps best not to leave the signed pointer lying around either
+        mc->__pc = ptrauth_strip(mc->__lr, 0);
+        mc->__pad = 0; // aka __ra_sign_state = not signed
+        context = &c;
+    #else
+    #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown darwin")
+        (void)mctx;
+        (void)c;
+    #endif
+    #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_)
+        sigjmp_buf *mctx = &t->ctx.ctx.uc_mcontext;
+        mcontext_t *mc = &c.uc_mcontext;
+        // https://github.com/freebsd/freebsd-src/blob/releng/13.1/lib/libc/amd64/gen/_setjmp.S
+        mc->mc_rip = ((long*)mctx)[0];
+        mc->mc_rbx = ((long*)mctx)[1];
+        mc->mc_rsp = ((long*)mctx)[2];
+        mc->mc_rbp = ((long*)mctx)[3];
+        mc->mc_r12 = ((long*)mctx)[4];
+        mc->mc_r13 = ((long*)mctx)[5];
+        mc->mc_r14 = ((long*)mctx)[6];
+        mc->mc_r15 = ((long*)mctx)[7];
+        context = &c;
+    #else
+    #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown system")
+        (void)c;
+    #endif
 #elif defined(JL_HAVE_ASYNCIFY)
-    #pragma message("jl_rec_backtrace not defined for ASYNCIFY")
+        #pragma message("jl_rec_backtrace not defined for ASYNCIFY")
 #elif defined(JL_HAVE_SIGALTSTACK)
-    #pragma message("jl_rec_backtrace not defined for SIGALTSTACK")
+        #pragma message("jl_rec_backtrace not defined for SIGALTSTACK")
 #else
-    #pragma message("jl_rec_backtrace not defined for unknown task system")
+        #pragma message("jl_rec_backtrace not defined for unknown task system")
 #endif
+    }
     if (context)
-        ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack);
+        ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack);
     if (old == -1)
         jl_atomic_store_relaxed(&t->tid, old);
+    else if (old != ptls->tid)
+        jl_thread_resume(old);
 }
 
 //--------------------------------------------------
@@ -1118,7 +1138,7 @@ JL_DLLEXPORT void jlbacktrace(void) JL_NOTSAFEPOINT
     }
 }
 
-// Print backtrace for specified task
+// Print backtrace for specified task, using jl_safe_printf to stderr
 JL_DLLEXPORT void jlbacktracet(jl_task_t *t) JL_NOTSAFEPOINT
 {
     jl_task_t *ct = jl_current_task;
@@ -1138,9 +1158,7 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT
 
 extern int gc_first_tid;
 
-// Print backtraces for all live tasks, for all threads.
-// WARNING: this is dangerous and can crash if used outside of gdb, if
-// all of Julia's threads are not stopped!
+// Print backtraces for all live tasks, for all threads, using jl_safe_printf to stderr
 JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT
 {
     size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
@@ -1151,25 +1169,35 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT
             continue;
         }
         jl_ptls_t ptls2 = allstates[i];
-        arraylist_t *live_tasks = &ptls2->heap.live_tasks;
-        size_t n = live_tasks->len;
+        if (ptls2 == NULL)
+            continue;
+        small_arraylist_t *live_tasks = &ptls2->heap.live_tasks;
+        size_t n = mtarraylist_length(live_tasks);
+        jl_task_t *t = ptls2->root_task;
+        int t_state = jl_atomic_load_relaxed(&t->_state);
         jl_safe_printf("==== Thread %d created %zu live tasks\n",
-                ptls2->tid + 1, n + 1);
-        jl_safe_printf("    ---- Root task (%p)\n", ptls2->root_task);
-        jl_safe_printf("        (sticky: %d, started: %d, state: %d, tid: %d)\n",
-                ptls2->root_task->sticky, ptls2->root_task->started,
-                jl_atomic_load_relaxed(&ptls2->root_task->_state),
-                jl_atomic_load_relaxed(&ptls2->root_task->tid) + 1);
-        jlbacktracet(ptls2->root_task);
-
-        void **lst = live_tasks->items;
-        for (size_t j = 0; j < live_tasks->len; j++) {
-            jl_task_t *t = (jl_task_t *)lst[j];
+                ptls2->tid + 1, n + (t_state != JL_TASK_STATE_DONE));
+        if (show_done || t_state != JL_TASK_STATE_DONE) {
+            jl_safe_printf("    ---- Root task (%p)\n", ptls2->root_task);
+            jl_safe_printf("        (sticky: %d, started: %d, state: %d, tid: %d)\n",
+                    t->sticky, t->started, t_state,
+                    jl_atomic_load_relaxed(&t->tid) + 1);
+            if (t->stkbuf != NULL)
+                jlbacktracet(t);
+            else
+                jl_safe_printf("      no stack\n");
+            jl_safe_printf("    ---- End root task\n");
+        }
+
+        for (size_t j = 0; j < n; j++) {
+            jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, j);
+            if (t == NULL)
+                continue;
             int t_state = jl_atomic_load_relaxed(&t->_state);
-            if (!show_done && t_state == JL_TASK_STATE_DONE) {
+            if (!show_done && t_state == JL_TASK_STATE_DONE)
                 continue;
-            }
             jl_safe_printf("    ---- Task %zu (%p)\n", j + 1, t);
+            // n.b. this information might not be consistent with the stack printing after it, since it could start running or change tid, etc.
jl_safe_printf(" (sticky: %d, started: %d, state: %d, tid: %d)\n", t->sticky, t->started, t_state, jl_atomic_load_relaxed(&t->tid) + 1); diff --git a/src/threading.c b/src/threading.c index 7ed8e3b6e7dc99..6b7e14d3b8db42 100644 --- a/src/threading.c +++ b/src/threading.c @@ -305,6 +305,8 @@ static uv_mutex_t tls_lock; // controls write-access to these variables: _Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED; int jl_all_tls_states_size; static uv_cond_t cond; +// concurrent reads are permitted, using the same pattern as mtsmall_arraylist +// it is implemented separately because the API of direct jl_all_tls_states use is already widely prevalent // return calling thread's ID JL_DLLEXPORT int16_t jl_threadid(void) @@ -338,7 +340,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid) #ifndef _OS_WINDOWS_ pthread_setspecific(jl_task_exit_key, (void*)ptls); #endif - ptls->system_id = (jl_thread_t)(uintptr_t)uv_thread_self(); + ptls->system_id = uv_thread_self(); ptls->rngseed = jl_rand(); if (tid == 0) ptls->disable_gc = 1; @@ -373,10 +375,10 @@ jl_ptls_t jl_init_threadtls(int16_t tid) uv_cond_init(&ptls->wake_signal); uv_mutex_lock(&tls_lock); - jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); if (tid == -1) tid = jl_atomic_load_relaxed(&jl_n_threads); ptls->tid = tid; + jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); if (jl_all_tls_states_size <= tid) { int i, newsize = jl_all_tls_states_size + tid + 2; jl_ptls_t *newpptls = (jl_ptls_t*)calloc(newsize, sizeof(jl_ptls_t));